[6/6] incubator-impala git commit: IMPALA-4397, IMPALA-3259: reduce codegen time and memory

kwho Wed, 23 Nov 2016 00:26:22 -0800

IMPALA-4397,IMPALA-3259: reduce codegen time and memory

A handful of fixes to codegen memory usage:
* Delete the IR module when we're done with it (it can be fairly large)
* Track the compiled code size (typically not that large, but it can add
  up if there are many fragments).
* Estimate optimisation memory requirements and track it in the memory
  tracker. This is very crude but much better than not tracking it.


A handful of fixes to improve codegen time/cost, particularly targeted
at compute stats workloads:
* Avoid over-inlining when there are many aggregate functions,
  conjuncts, etc by adding "NoInline" attributes.
* Don't codegen non-grouping merge aggregations. They will only process
  one row per Impala daemon, so codegen is not worth it.
* Make the Hll algorithm more efficient by specialising the hash function
  based on decimal width.

Limitations:
* This doesn't tackle over-inlining of large expr trees, but a similar
  approach will be used there in a follow-on patch.

Perf:
Compute stats on functional_parquet.widetable_1000_cols goes from 1min+
of codegen to ~ 5s codegen on my machine. Local perf runs of tpc-h
and targeted perf showed no regressions and some moderate improvements
(1-2%).

Also did an experiment to understand the perf consequences of disabling
inlining. I manually set CODEGEN_INLINE_EXPRS_THRESHOLD to 0, and ran:

  drop stats tpch_20_parquet.lineitem
  compute stats tpch_20_parquet.lineitem;

There was no difference in time spent in the agg node: 30.7s with
inlining, 30.5s without.

Change-Id: Id10015b49da182cb181a653ac8464b4a18b71091
Reviewed-on: http://gerrit.cloudera.org:8080/4956
Reviewed-by: Tim Armstrong <[email protected]>
Reviewed-by: Marcel Kornacker <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/4db330e6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/4db330e6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/4db330e6

Branch: refs/heads/master
Commit: 4db330e69a2dbb4a23f46e34b484da0d6b9ef29b
Parents: 696fb68
Author: Tim Armstrong <[email protected]>
Authored: Tue Oct 4 17:09:17 2016 -0700
Committer: Internal Jenkins <[email protected]>
Committed: Wed Nov 23 08:18:17 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/hash-benchmark.cc             |    2 +-
 be/src/codegen/llvm-codegen-test.cc             |   40 +-
 be/src/codegen/llvm-codegen.cc                  |  114 +-
 be/src/codegen/llvm-codegen.h                   |   84 +-
 be/src/codegen/mcjit-mem-mgr.h                  |   38 +-
 be/src/exec/aggregation-node.cc                 |    4 +-
 be/src/exec/exchange-node.cc                    |    4 +-
 be/src/exec/exec-node.cc                        |   17 +-
 be/src/exec/exec-node.h                         |    7 +
 be/src/exec/hash-join-node.cc                   |   17 +-
 be/src/exec/hash-table.cc                       |   41 +-
 be/src/exec/hdfs-scan-node-base.cc              |    5 +-
 be/src/exec/hdfs-scanner.cc                     |    9 +
 be/src/exec/partitioned-aggregation-node.cc     |   32 +-
 be/src/exec/partitioned-aggregation-node.h      |    2 +-
 be/src/exec/partitioned-hash-join-node.cc       |   11 +-
 be/src/exec/sort-node.cc                        |    9 +-
 be/src/exec/topn-node.cc                        |    4 +-
 be/src/exprs/aggregate-functions-ir.cc          |   22 +-
 be/src/exprs/anyval-util.h                      |   19 +-
 be/src/exprs/expr-codegen-test.cc               |    3 +-
 be/src/exprs/expr.h                             |    1 +
 be/src/runtime/lib-cache.cc                     |    4 +-
 be/src/runtime/runtime-state.cc                 |   14 +-
 common/thrift/PlanNodes.thrift                  |   36 +-
 .../impala/planner/DistributedPlanner.java      |    6 +
 .../org/apache/impala/planner/PlanNode.java     |   12 +-
 .../queries/QueryTest/compute-stats.test        | 1016 ++++++++++++++++++
 tests/query_test/test_aggregation.py            |   51 +-
 29 files changed, 1479 insertions(+), 145 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/benchmarks/hash-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/hash-benchmark.cc 
b/be/src/benchmarks/hash-benchmark.cc
index 4f6d7f1..125310b 100644
--- a/be/src/benchmarks/hash-benchmark.cc
+++ b/be/src/benchmarks/hash-benchmark.cc
@@ -429,7 +429,7 @@ int main(int argc, char **argv) {
 
   Status status;
   scoped_ptr<LlvmCodeGen> codegen;
-  status = LlvmCodeGen::CreateImpalaCodegen(&obj_pool, "test", &codegen);
+  status = LlvmCodeGen::CreateImpalaCodegen(&obj_pool, NULL, "test", &codegen);
   if (!status.ok()) {
     cout << "Could not start codegen.";
     return -1;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/codegen/llvm-codegen-test.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen-test.cc 
b/be/src/codegen/llvm-codegen-test.cc
index 178baa2..1238dde 100644
--- a/be/src/codegen/llvm-codegen-test.cc
+++ b/be/src/codegen/llvm-codegen-test.cc
@@ -42,9 +42,9 @@ class LlvmCodeGenTest : public testing:: Test {
     ObjectPool pool;
     Status status;
     for (int i = 0; i < 10; ++i) {
-      LlvmCodeGen object1(&pool, "Test");
-      LlvmCodeGen object2(&pool, "Test");
-      LlvmCodeGen object3(&pool, "Test");
+      LlvmCodeGen object1(&pool, NULL, "Test");
+      LlvmCodeGen object2(&pool, NULL, "Test");
+      LlvmCodeGen object3(&pool, NULL, "Test");
 
       ASSERT_OK(object1.Init(unique_ptr<Module>(new Module("Test", 
object1.context()))));
       ASSERT_OK(object2.Init(unique_ptr<Module>(new Module("Test", 
object2.context()))));
@@ -53,16 +53,16 @@ class LlvmCodeGenTest : public testing:: Test {
   }
 
   // Wrapper to call private test-only methods on LlvmCodeGen object
-  static Status CreateFromFile(ObjectPool* pool, const string& filename,
-      scoped_ptr<LlvmCodeGen>* codegen) {
-    return LlvmCodeGen::CreateFromFile(pool, filename, "test", codegen);
+  static Status CreateFromFile(
+      ObjectPool* pool, const string& filename, scoped_ptr<LlvmCodeGen>* 
codegen) {
+    return LlvmCodeGen::CreateFromFile(pool, NULL, filename, "test", codegen);
   }
 
   static LlvmCodeGen* CreateCodegen(ObjectPool* pool) {
-    LlvmCodeGen* codegen = pool->Add(new LlvmCodeGen(pool, "Test"));
+    LlvmCodeGen* codegen = pool->Add(new LlvmCodeGen(pool, NULL, "Test"));
     if (codegen != NULL) {
-      Status status = codegen->Init(
-          unique_ptr<Module>(new Module("Test", codegen->context())));
+      Status status =
+          codegen->Init(unique_ptr<Module>(new Module("Test", 
codegen->context())));
       if (!status.ok()) return NULL;
     }
     return codegen;
@@ -82,10 +82,7 @@ class LlvmCodeGenTest : public testing:: Test {
     return codegen->VerifyFunction(fn);
   }
 
-  static Status FinalizeModule(LlvmCodeGen* codegen) {
-    return codegen->FinalizeModule();
-  }
-
+  static Status FinalizeModule(LlvmCodeGen* codegen) { return 
codegen->FinalizeModule(); }
 };
 
 // Simple test to just make and destroy llvmcodegen objects.  LLVM
@@ -109,7 +106,8 @@ TEST_F(LlvmCodeGenTest, BadIRFile) {
   ObjectPool pool;
   string module_file = "NonExistentFile.ir";
   scoped_ptr<LlvmCodeGen> codegen;
-  EXPECT_FALSE(LlvmCodeGenTest::CreateFromFile(&pool, module_file.c_str(), 
&codegen).ok());
+  EXPECT_FALSE(
+      LlvmCodeGenTest::CreateFromFile(&pool, module_file.c_str(), 
&codegen).ok());
 }
 
 // IR for the generated linner loop
@@ -289,7 +287,7 @@ TEST_F(LlvmCodeGenTest, StringValue) {
   ObjectPool pool;
 
   scoped_ptr<LlvmCodeGen> codegen;
-  ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen));
+  ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, NULL, "test", &codegen));
   EXPECT_TRUE(codegen.get() != NULL);
 
   string str("Test");
@@ -332,7 +330,7 @@ TEST_F(LlvmCodeGenTest, MemcpyTest) {
   ObjectPool pool;
 
   scoped_ptr<LlvmCodeGen> codegen;
-  ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen));
+  ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, NULL, "test", &codegen));
   ASSERT_TRUE(codegen.get() != NULL);
 
   LlvmCodeGen::FnPrototype prototype(codegen.get(), "MemcpyTest", 
codegen->void_type());
@@ -379,13 +377,13 @@ TEST_F(LlvmCodeGenTest, HashTest) {
   // Loop to test both the sse4 on/off paths
   for (int i = 0; i < 2; ++i) {
     scoped_ptr<LlvmCodeGen> codegen;
-    ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen));
+    ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, NULL, "test", &codegen));
     ASSERT_TRUE(codegen.get() != NULL);
 
-    Value* llvm_data1 = codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
-        const_cast<char*>(data1));
-    Value* llvm_data2 = codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
-        const_cast<char*>(data2));
+    Value* llvm_data1 =
+        codegen->CastPtrToLlvmPtr(codegen->ptr_type(), 
const_cast<char*>(data1));
+    Value* llvm_data2 =
+        codegen->CastPtrToLlvmPtr(codegen->ptr_type(), 
const_cast<char*>(data2));
     Value* llvm_len1 = codegen->GetIntConstant(TYPE_INT, strlen(data1));
     Value* llvm_len2 = codegen->GetIntConstant(TYPE_INT, strlen(data2));
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/codegen/llvm-codegen.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index f6e7e97..596f48a 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -25,9 +25,9 @@
 #include <gutil/strings/substitute.h>
 
 #include <llvm/ADT/Triple.h>
-#include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/Analysis/InstructionSimplify.h>
 #include <llvm/Analysis/Passes.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #include <llvm/ExecutionEngine/MCJIT.h>
@@ -52,17 +52,18 @@
 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
 #include <llvm/Transforms/Utils/Cloning.h>
 
-#include "common/logging.h"
 #include "codegen/codegen-anyval.h"
 #include "codegen/codegen-symbol-emitter.h"
 #include "codegen/impala-ir-data.h"
 #include "codegen/instruction-counter.h"
 #include "codegen/mcjit-mem-mgr.h"
+#include "common/logging.h"
 #include "impala-ir/impala-ir-names.h"
 #include "runtime/descriptors.h"
 #include "runtime/hdfs-fs-cache.h"
 #include "runtime/lib-cache.h"
 #include "runtime/mem-pool.h"
+#include "runtime/mem-tracker.h"
 #include "runtime/string-value.h"
 #include "runtime/timestamp-value.h"
 #include "util/cpu-info.h"
@@ -76,7 +77,7 @@
 using namespace llvm;
 using namespace strings;
 using std::fstream;
-using std::unique_ptr;
+using std::move;
 
 DEFINE_bool(print_llvm_ir_instruction_count, false,
     "if true, prints the instruction counts of all JIT'd functions");
@@ -200,7 +201,7 @@ void LlvmCodeGen::InitializeLlvm(bool load_backend) {
 
   ObjectPool init_pool;
   scoped_ptr<LlvmCodeGen> init_codegen;
-  Status status = LlvmCodeGen::CreateFromMemory(&init_pool, "init", 
&init_codegen);
+  Status status = LlvmCodeGen::CreateFromMemory(&init_pool, NULL, "init", 
&init_codegen);
   ParseGVForFunctions(init_codegen->module_, &gv_ref_ir_fns_);
 
   // Validate the module by verifying that functions for all IRFunction::Type
@@ -213,16 +214,18 @@ void LlvmCodeGen::InitializeLlvm(bool load_backend) {
   }
 }
 
-LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& id) :
-  id_(id),
-  profile_(pool, "CodeGen"),
-  optimizations_enabled_(false),
-  is_corrupt_(false),
-  is_compiled_(false),
-  context_(new llvm::LLVMContext()),
-  module_(NULL),
-  loaded_functions_(IRFunction::FN_END, NULL) {
-
+LlvmCodeGen::LlvmCodeGen(
+    ObjectPool* pool, MemTracker* parent_mem_tracker, const string& id)
+  : id_(id),
+    profile_(pool, "CodeGen"),
+    mem_tracker_(new MemTracker(&profile_, -1, "CodeGen", parent_mem_tracker)),
+    optimizations_enabled_(false),
+    is_corrupt_(false),
+    is_compiled_(false),
+    context_(new llvm::LLVMContext()),
+    module_(NULL),
+    memory_manager_(NULL),
+    loaded_functions_(IRFunction::FN_END, NULL) {
   DCHECK(llvm_initialized_) << "Must call LlvmCodeGen::InitializeLlvm first.";
 
   load_module_timer_ = ADD_TIMER(&profile_, "LoadTime");
@@ -235,9 +238,9 @@ LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& 
id) :
   num_instructions_ = ADD_COUNTER(&profile_, "NumInstructions", TUnit::UNIT);
 }
 
-Status LlvmCodeGen::CreateFromFile(ObjectPool* pool,
+Status LlvmCodeGen::CreateFromFile(ObjectPool* pool, MemTracker* 
parent_mem_tracker,
     const string& file, const string& id, scoped_ptr<LlvmCodeGen>* codegen) {
-  codegen->reset(new LlvmCodeGen(pool, id));
+  codegen->reset(new LlvmCodeGen(pool, parent_mem_tracker, id));
   SCOPED_TIMER((*codegen)->profile_.total_time_counter());
 
   unique_ptr<Module> loaded_module;
@@ -246,9 +249,9 @@ Status LlvmCodeGen::CreateFromFile(ObjectPool* pool,
   return (*codegen)->Init(std::move(loaded_module));
 }
 
-Status LlvmCodeGen::CreateFromMemory(ObjectPool* pool, const string& id,
-    scoped_ptr<LlvmCodeGen>* codegen) {
-  codegen->reset(new LlvmCodeGen(pool, id));
+Status LlvmCodeGen::CreateFromMemory(ObjectPool* pool, MemTracker* 
parent_mem_tracker,
+    const string& id, scoped_ptr<LlvmCodeGen>* codegen) {
+  codegen->reset(new LlvmCodeGen(pool, parent_mem_tracker, id));
   SCOPED_TIMER((*codegen)->profile_.total_time_counter());
 
   // Select the appropriate IR version. We cannot use LLVM IR with SSE4.2 
instructions on
@@ -375,9 +378,9 @@ void LlvmCodeGen::StripGlobalCtorsDtors(llvm::Module* 
module) {
   if (destructors != NULL) destructors->eraseFromParent();
 }
 
-Status LlvmCodeGen::CreateImpalaCodegen(
-    ObjectPool* pool, const string& id, scoped_ptr<LlvmCodeGen>* codegen_ret) {
-  RETURN_IF_ERROR(CreateFromMemory(pool, id, codegen_ret));
+Status LlvmCodeGen::CreateImpalaCodegen(ObjectPool* pool, MemTracker* 
parent_mem_tracker,
+    const string& id, scoped_ptr<LlvmCodeGen>* codegen_ret) {
+  RETURN_IF_ERROR(CreateFromMemory(pool, parent_mem_tracker, id, codegen_ret));
   LlvmCodeGen* codegen = codegen_ret->get();
 
   // Parse module for cross compiled functions and types
@@ -422,8 +425,9 @@ Status LlvmCodeGen::Init(unique_ptr<Module> module) {
   EngineBuilder builder(std::move(module));
   builder.setEngineKind(EngineKind::JIT);
   builder.setOptLevel(opt_level);
-  builder.setMCJITMemoryManager(
-      unique_ptr<ImpalaMCJITMemoryManager>(new ImpalaMCJITMemoryManager()));
+  unique_ptr<ImpalaMCJITMemoryManager> memory_manager(new 
ImpalaMCJITMemoryManager);
+  memory_manager_ = memory_manager.get();
+  builder.setMCJITMemoryManager(move(memory_manager));
   builder.setMCPU(cpu_name_);
   builder.setMAttrs(cpu_attrs_);
   builder.setErrorStr(&error_string_);
@@ -464,6 +468,10 @@ void LlvmCodeGen::SetupJITListeners() {
 }
 
 LlvmCodeGen::~LlvmCodeGen() {
+  if (memory_manager_ != NULL) 
mem_tracker_->Release(memory_manager_->bytes_tracked());
+  if (mem_tracker_->parent() != NULL) mem_tracker_->UnregisterFromParent();
+  mem_tracker_.reset();
+
   // Execution engine executes callback on event listener, so tear down engine 
first.
   execution_engine_.reset();
   symbol_emitter_.reset();
@@ -753,8 +761,14 @@ bool LlvmCodeGen::VerifyFunction(Function* fn) {
   return true;
 }
 
-LlvmCodeGen::FnPrototype::FnPrototype(LlvmCodeGen* codegen, const string& name,
-    Type* ret_type) : codegen_(codegen), name_(name), ret_type_(ret_type) {
+void LlvmCodeGen::SetNoInline(llvm::Function* function) const {
+  function->removeFnAttr(llvm::Attribute::AlwaysInline);
+  function->addFnAttr(llvm::Attribute::NoInline);
+}
+
+LlvmCodeGen::FnPrototype::FnPrototype(
+    LlvmCodeGen* codegen, const string& name, Type* ret_type)
+  : codegen_(codegen), name_(name), ret_type_(ret_type) {
   DCHECK(!codegen_->is_compiled_) << "Not valid to add additional functions";
 }
 
@@ -927,10 +941,15 @@ Status LlvmCodeGen::FinalizeModule() {
 
   // Don't waste time optimizing module if there are no functions to JIT. This 
can happen
   // if the codegen object is created but no functions are successfully 
codegen'd.
-  if (fns_to_jit_compile_.empty()) return Status::OK();
+  if (fns_to_jit_compile_.empty()) {
+    DestroyModule();
+    return Status::OK();
+  }
 
   RETURN_IF_ERROR(FinalizeLazyMaterialization());
-  if (optimizations_enabled_ && !FLAGS_disable_optimization_passes) 
OptimizeModule();
+  if (optimizations_enabled_ && !FLAGS_disable_optimization_passes) {
+    RETURN_IF_ERROR(OptimizeModule());
+  }
 
   if (FLAGS_opt_module_dir.size() != 0) {
     string path = FLAGS_opt_module_dir + "/" + id_ + "_opt.ll";
@@ -949,17 +968,28 @@ Status LlvmCodeGen::FinalizeModule() {
     execution_engine_->finalizeObject();
   }
 
-  // Get pointers to all codegen'd functions.
+  // Get pointers to all codegen'd functions
   for (int i = 0; i < fns_to_jit_compile_.size(); ++i) {
     Function* function = fns_to_jit_compile_[i].first;
     void* jitted_function = execution_engine_->getPointerToFunction(function);
     DCHECK(jitted_function != NULL) << "Failed to jit " << 
function->getName().data();
     *fns_to_jit_compile_[i].second = jitted_function;
   }
+
+  DestroyModule();
+
+  // Track the memory consumed by the compiled code.
+  int64_t bytes_allocated = memory_manager_->bytes_allocated();
+  if (!mem_tracker_->TryConsume(bytes_allocated)) {
+    const string& msg = Substitute(
+        "Failed to allocate '$0' bytes for compiled code module", 
bytes_allocated);
+    return mem_tracker_->MemLimitExceeded(NULL, msg, bytes_allocated);
+  }
+  memory_manager_->set_bytes_tracked(bytes_allocated);
   return Status::OK();
 }
 
-void LlvmCodeGen::OptimizeModule() {
+Status LlvmCodeGen::OptimizeModule() {
   SCOPED_TIMER(optimization_timer_);
 
   // This pass manager will construct optimizations passes that are "typical" 
for
@@ -1003,6 +1033,14 @@ void LlvmCodeGen::OptimizeModule() {
   COUNTER_SET(num_functions_, 
counter.GetCount(InstructionCounter::TOTAL_FUNCTIONS));
   COUNTER_SET(num_instructions_, 
counter.GetCount(InstructionCounter::TOTAL_INSTS));
 
+  int64_t estimated_memory = ESTIMATED_OPTIMIZER_BYTES_PER_INST
+      * counter.GetCount(InstructionCounter::TOTAL_INSTS);
+  if (!mem_tracker_->TryConsume(estimated_memory)) {
+    const string& msg = Substitute(
+        "Codegen failed to reserve '$0' bytes for optimization", 
estimated_memory);
+    return mem_tracker_->MemLimitExceeded(NULL, msg, estimated_memory);
+  }
+
   // Create and run function pass manager
   unique_ptr<legacy::FunctionPassManager> fn_pass_manager(
       new legacy::FunctionPassManager(module_));
@@ -1027,6 +1065,22 @@ void LlvmCodeGen::OptimizeModule() {
       VLOG(1) << counter.PrintCounters();
     }
   }
+
+  mem_tracker_->Release(estimated_memory);
+  return Status::OK();
+}
+
+void LlvmCodeGen::DestroyModule() {
+  // Clear all references to LLVM objects owned by the module.
+  loaded_functions_.clear();
+  codegend_functions_.clear();
+  registered_exprs_map_.clear();
+  registered_exprs_.clear();
+  llvm_intrinsics_.clear();
+  hash_fns_.clear();
+  fns_to_jit_compile_.clear();
+  execution_engine_->removeModule(module_);
+  module_ = NULL;
 }
 
 void LlvmCodeGen::AddFunctionToJit(Function* fn, void** fn_ptr) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/codegen/llvm-codegen.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h
index 21f9f71..8853d7e 100644
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -72,6 +72,7 @@ namespace llvm {
 namespace impala {
 
 class CodegenSymbolEmitter;
+class ImpalaMCJITMemoryManager;
 class SubExprElimination;
 class TupleDescriptor;
 
@@ -124,6 +125,14 @@ class LlvmBuilder : public llvm::IRBuilder<> {
 /// instructions attached to the function object. Functions reachable by the 
function
 /// are also materialized recursively.
 //
+/// Memory used for codegen is tracked via the MemTracker hierarchy. Codegen 
can use
+/// significant memory for the IR module and for the optimization and 
compilation
+/// algorithms. LLVM provides no way to directly track this transient memory - 
instead
+/// the memory consumption is estimated based on the size of the IR module and 
released
+/// once compilation finishes. Once compilation finishes, the size of the 
compiled
+/// machine code is obtained from LLVM and and is tracked until the 
LlvmCodeGen object
+/// is torn down and the compiled code is freed.
+//
 class LlvmCodeGen {
  public:
   /// This function must be called once per process before any llvm API calls 
are
@@ -138,13 +147,15 @@ class LlvmCodeGen {
 
   /// Creates a codegen instance for Impala initialized with the 
cross-compiled Impala IR.
   /// 'codegen' will contain the created object on success.
+  /// 'parent_mem_tracker' - if non-NULL, the CodeGen MemTracker is created 
under this.
   /// 'id' is used for outputting the IR module for debugging.
-  static Status CreateImpalaCodegen(
-      ObjectPool*, const std::string& id, boost::scoped_ptr<LlvmCodeGen>* 
codegen);
+  static Status CreateImpalaCodegen(ObjectPool*, MemTracker* 
parent_mem_tracker,
+      const std::string& id, boost::scoped_ptr<LlvmCodeGen>* codegen);
 
   /// Creates a LlvmCodeGen instance initialized with the module bitcode from 
'file'.
   /// 'codegen' will contain the created object on success.
-  static Status CreateFromFile(ObjectPool*, const std::string& file, const 
std::string& id,
+  static Status CreateFromFile(ObjectPool*, MemTracker* parent_mem_tracker,
+      const std::string& file, const std::string& id,
       boost::scoped_ptr<LlvmCodeGen>* codegen);
 
   /// Removes all jit compiled dynamically linked functions from the process.
@@ -202,8 +213,8 @@ class LlvmCodeGen {
     /// If 'print_ir' is true, the generated llvm::Function's IR will be 
printed when
     /// GetIR() is called. Avoid doing so for IR function prototypes generated 
for
     /// externally defined native function.
-    llvm::Function* GeneratePrototype(LlvmBuilder* builder = NULL,
-        llvm::Value** params = NULL, bool print_ir = true);
+    llvm::Function* GeneratePrototype(
+        LlvmBuilder* builder = NULL, llvm::Value** params = NULL, bool 
print_ir = true);
 
    private:
     friend class LlvmCodeGen;
@@ -266,7 +277,9 @@ class LlvmCodeGen {
 
   /// Optimize and compile the module. This should be called after all 
functions to JIT
   /// have been added to the module via AddFunctionToJit(). If 
optimizations_enabled_ is
-  /// false, the module will not be optimized before compilation.
+  /// false, the module will not be optimized before compilation. After 
FinalizeModule()
+  /// is called, the LLVM module is destroyed and it is invalid to call any 
LlvmCodegen
+  /// functions.
   Status FinalizeModule();
 
   /// Replaces all instructions in 'caller' that call 'target_name' with a call
@@ -357,6 +370,10 @@ class LlvmCodeGen {
   llvm::Function* GetFnvHashFunction(int num_bytes = -1);
   llvm::Function* GetMurmurHashFunction(int num_bytes = -1);
 
+  /// Set the NoInline attribute on 'function' and remove the AlwaysInline 
attribute if
+  /// present.
+  void SetNoInline(llvm::Function* function) const;
+
   /// Allocate stack storage for local variables.  This is similar to 
traditional c, where
   /// all the variables must be declared at the top of the function.  This 
helper can be
   /// called from anywhere and will add a stack allocation for 'var' at the 
beginning of
@@ -454,6 +471,15 @@ class LlvmCodeGen {
   /// this LlvmCodeGen object. The module must be on the local filesystem.
   Status LinkModule(const std::string& file);
 
+  /// If there are more than this number of expr trees (or functions that 
evaluate
+  /// expressions), avoid inlining avoid inlining for the exprs exceeding this 
threshold.
+  static const int CODEGEN_INLINE_EXPRS_THRESHOLD = 100;
+
+  /// If there are more than this number of expr trees (or functions that 
evaluate
+  /// expressions), avoid inlining the function that evaluates the expression 
batch
+  /// into the calling function.
+  static const int CODEGEN_INLINE_EXPR_BATCH_THRESHOLD = 25;
+
  private:
   friend class ExprCodegenTest;
   friend class LlvmCodeGenTest;
@@ -471,11 +497,12 @@ class LlvmCodeGen {
   /// Parses all the global variables in 'module' and adds any functions 
referenced by
   /// them to the set 'ref_fns' if they are not defined in the Impalad native 
code.
   /// These functions need to be materialized to avoid linking error.
-  static void ParseGVForFunctions(llvm::Module* module,
-      boost::unordered_set<string>* ref_fns);
+  static void ParseGVForFunctions(
+      llvm::Module* module, boost::unordered_set<string>* ref_fns);
 
   /// Top level codegen object.  'module_id' is used for debugging when 
outputting the IR.
-  LlvmCodeGen(ObjectPool* pool, const std::string& module_id);
+  LlvmCodeGen(
+      ObjectPool* pool, MemTracker* parent_mem_tracker, const std::string& 
module_id);
 
   /// Initializes the jitter and execution engine with the given module.
   Status Init(std::unique_ptr<llvm::Module> module);
@@ -484,8 +511,8 @@ class LlvmCodeGen {
   /// 'codegen' will contain the created object on success. Note that the 
functions
   /// are not materialized. Getting a reference to the function via 
GetFunction()
   /// will materialize the function and its callees recursively.
-  static Status CreateFromMemory(ObjectPool* pool, const std::string& id,
-      boost::scoped_ptr<LlvmCodeGen>* codegen);
+  static Status CreateFromMemory(ObjectPool* pool, MemTracker* 
parent_mem_tracker,
+      const std::string& id, boost::scoped_ptr<LlvmCodeGen>* codegen);
 
   /// Loads an LLVM module from 'file' which is the local path to the LLVM 
bitcode file.
   /// The functions in the module are not materialized. Getting a reference to 
the
@@ -524,7 +551,7 @@ class LlvmCodeGen {
   void ResetVerification() { is_corrupt_ = false; }
 
   /// Optimizes the module. This includes pruning the module of any unused 
functions.
-  void OptimizeModule();
+  Status OptimizeModule();
 
   /// Clears generated hash fns.  This is only used for testing.
   void ClearHashFns();
@@ -570,6 +597,10 @@ class LlvmCodeGen {
   /// there is error in materializing the module.
   Status FinalizeLazyMaterialization();
 
+  /// Destroy the IR module, freeing memory used by the IR. Any machine code 
that was
+  /// generated is retained by the execution engine.
+  void DestroyModule();
+
   /// Whether InitializeLlvm() has been called.
   static bool llvm_initialized_;
 
@@ -589,6 +620,10 @@ class LlvmCodeGen {
   /// Codegen counters
   RuntimeProfile profile_;
 
+  /// MemTracker used for tracking memory consumed by codegen. Connected to a 
parent
+  /// MemTracker if one was provided during initialization.
+  boost::scoped_ptr<MemTracker> mem_tracker_;
+
   /// Time spent reading the .ir file from the file system.
   RuntimeProfile::Counter* load_module_timer_;
 
@@ -639,6 +674,9 @@ class LlvmCodeGen {
   /// Execution/Jitting engine.
   std::unique_ptr<llvm::ExecutionEngine> execution_engine_;
 
+  /// The memory manager used by 'execution_engine_'. Owned by 
'execution_engine_'.
+  ImpalaMCJITMemoryManager* memory_manager_;
+
   /// Functions parsed from pre-compiled module.  Indexed by 
ImpalaIR::Function enum
   std::vector<llvm::Function*> loaded_functions_;
 
@@ -686,8 +724,28 @@ class LlvmCodeGen {
   /// 'symbol_emitter_' are called by 'execution_engine_' when code is emitted 
or freed.
   /// The lifetime of the symbol emitter must be longer than 
'execution_engine_'.
   boost::scoped_ptr<CodegenSymbolEmitter> symbol_emitter_;
-};
 
+  /// Very rough estimate of memory in bytes that the IR and the intermediate 
data
+  /// structures used by the optimizer may consume per LLVM IR instruction to 
be
+  /// optimized (after dead code is removed). The number is chosen to avoid 
pathological
+  /// behaviour at either extreme: failing queries unnecessarily because the 
memory
+  /// estimate is too high versus having large amounts of untracked memory 
because the
+  /// estimate is too low.
+  ///
+  /// This was chosen by looking at the behaviour of TPC-H queries. Using the 
heap growth
+  /// profile from gperftools reveal that LLVM allocated ~9mb of memory for 
fragments with
+  /// ~17k total instructions in TPC-H Q2. Inspection of other TPC-H queries 
revealed
+  /// that a typical fragment from a TPC-H query is < 5,000 instructions, 
which translates
+  /// to 2.5MB, which is almost always lower than the runtime memory 
requirement of the
+  /// fragment - so we are unlikely to fail queries unnecessarily.
+  ///
+  /// This assumes optimizer memory usage scales linearly with instruction 
count. This is
+  /// true only if the size of functions is bounded, because some optimization 
passes
+  /// (e.g. global value numbering) use time and memory that is super-linear 
in relation
+  /// to the # of instructions in a function. So codegen should avoid 
generating
+  /// arbitrarily large function.
+  static constexpr int64_t ESTIMATED_OPTIMIZER_BYTES_PER_INST = 512;
+};
 }
 
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/codegen/mcjit-mem-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/mcjit-mem-mgr.h b/be/src/codegen/mcjit-mem-mgr.h
index 313d175..6982cef 100644
--- a/be/src/codegen/mcjit-mem-mgr.h
+++ b/be/src/codegen/mcjit-mem-mgr.h
@@ -25,19 +25,53 @@ extern void *__dso_handle __attribute__ ((__visibility__ 
("hidden")));
 
 namespace impala {
 
-/// Custom memory manager to resolve references to __dso_handle in 
cross-compiled IR.
+/// Custom memory manager. It is needed for a couple of purposes.
+///
+/// We use it as a way to resolve references to __dso_handle in cross-compiled 
IR.
 /// This uses the same approach as the legacy llvm JIT to handle __dso_handle. 
MCJIT
 /// doesn't handle those for us: see LLVM issue 18062.
 /// TODO: get rid of this by purging the cross-compiled IR of references to 
__dso_handle,
 /// which come from global variables with destructors.
+///
+/// We also use it to track how much memory is allocated for compiled code.
 class ImpalaMCJITMemoryManager : public llvm::SectionMemoryManager {
  public:
+  ImpalaMCJITMemoryManager() : bytes_allocated_(0), bytes_tracked_(0){};
+
   virtual uint64_t getSymbolAddress(const std::string& name) override {
     if (name == "__dso_handle") return 
reinterpret_cast<uint64_t>(&__dso_handle);
     return SectionMemoryManager::getSymbolAddress(name);
   }
-};
 
+  virtual uint8_t* allocateCodeSection(uintptr_t size, unsigned alignment,
+      unsigned section_id, llvm::StringRef section_name) override {
+    bytes_allocated_ += size;
+    return llvm::SectionMemoryManager::allocateCodeSection(
+        size, alignment, section_id, section_name);
+  }
+
+  virtual uint8_t* allocateDataSection(uintptr_t size, unsigned alignment,
+      unsigned section_id, llvm::StringRef section_name, bool is_read_only) 
override {
+    bytes_allocated_ += size;
+    return llvm::SectionMemoryManager::allocateDataSection(
+        size, alignment, section_id, section_name, is_read_only);
+  }
+
+  int64_t bytes_allocated() const { return bytes_allocated_; }
+  int64_t bytes_tracked() const { return bytes_tracked_; }
+  void set_bytes_tracked(int64_t bytes_tracked) {
+    DCHECK_LE(bytes_tracked, bytes_allocated_);
+    bytes_tracked_ = bytes_tracked;
+  }
+
+ private:
+  /// Total bytes allocated for the compiled code.
+  int64_t bytes_allocated_;
+
+  /// Total bytes already tracked by MemTrackers. <= 'bytes_allocated_'.
+  /// Needed to release the correct amount from the MemTracker when done.
+  int64_t bytes_tracked_;
+};
 }
 
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/aggregation-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/aggregation-node.cc b/be/src/exec/aggregation-node.cc
index f3b054e..822fc5d 100644
--- a/be/src/exec/aggregation-node.cc
+++ b/be/src/exec/aggregation-node.cc
@@ -164,6 +164,9 @@ Status AggregationNode::Prepare(RuntimeState* state) {
 
 void AggregationNode::Codegen(RuntimeState* state) {
   DCHECK(state->ShouldCodegen());
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
   bool codegen_enabled = false;
   LlvmCodeGen* codegen = state->codegen();
   DCHECK(codegen != NULL);
@@ -178,7 +181,6 @@ void AggregationNode::Codegen(RuntimeState* state) {
     }
   }
   runtime_profile()->AddCodegenMsg(codegen_enabled);
-  ExecNode::Codegen(state);
 }
 
 Status AggregationNode::Open(RuntimeState* state) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/exchange-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/exchange-node.cc b/be/src/exec/exchange-node.cc
index 2b2b033..0bbd42b 100644
--- a/be/src/exec/exchange-node.cc
+++ b/be/src/exec/exchange-node.cc
@@ -92,11 +92,13 @@ Status ExchangeNode::Prepare(RuntimeState* state) {
 
 void ExchangeNode::Codegen(RuntimeState* state) {
   DCHECK(state->ShouldCodegen());
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
   if (is_merging_) {
     Status codegen_status = less_than_->Codegen(state);
     runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status);
   }
-  ExecNode::Codegen(state);
 }
 
 Status ExchangeNode::Open(RuntimeState* state) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/exec-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/exec-node.cc b/be/src/exec/exec-node.cc
index 95306ec..518a0dd 100644
--- a/be/src/exec/exec-node.cc
+++ b/be/src/exec/exec-node.cc
@@ -129,6 +129,7 @@ ExecNode::ExecNode(ObjectPool* pool, const TPlanNode& 
tnode, const DescriptorTbl
     rows_returned_counter_(NULL),
     rows_returned_rate_(NULL),
     containing_subplan_(NULL),
+    disable_codegen_(tnode.disable_codegen),
     is_closed_(false) {
   InitRuntimeProfile(PrintPlanNodeType(tnode.node_type));
 }
@@ -465,6 +466,10 @@ void ExecNode::AddCodegenDisabledMessage(RuntimeState* 
state) {
   }
 }
 
+bool ExecNode::IsNodeCodegenDisabled() const {
+  return disable_codegen_;
+}
+
 // Codegen for EvalConjuncts.  The generated signature is
 // For a node with two conjunct predicates
 // define i1 @EvalConjuncts(%"class.impala::ExprContext"** %ctxs, i32 
%num_ctxs,
@@ -507,6 +512,10 @@ Status ExecNode::CodegenEvalConjuncts(LlvmCodeGen* codegen,
   for (int i = 0; i < conjunct_ctxs.size(); ++i) {
     RETURN_IF_ERROR(
         conjunct_ctxs[i]->root()->GetCodegendComputeFn(codegen, 
&conjunct_fns[i]));
+    if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) {
+      // Avoid bloating EvalConjuncts by inlining everything into it.
+      codegen->SetNoInline(conjunct_fns[i]);
+    }
   }
 
   // Construct function signature to match
@@ -566,12 +575,16 @@ Status ExecNode::CodegenEvalConjuncts(LlvmCodeGen* 
codegen,
     builder.CreateRet(codegen->true_value());
   }
 
+  // Avoid inlining EvalConjuncts into caller if it is large.
+  if (conjunct_ctxs.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) 
{
+    codegen->SetNoInline(*fn);
+  }
+
   *fn = codegen->FinalizeFunction(*fn);
   if (*fn == NULL) {
     return Status("ExecNode::CodegenEvalConjuncts(): codegen'd EvalConjuncts() 
function "
-        "failed verification, see log");
+                  "failed verification, see log");
   }
   return Status::OK();
 }
-
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/exec-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/exec-node.h b/be/src/exec/exec-node.h
index 1dd2d32..6a7737b 100644
--- a/be/src/exec/exec-node.h
+++ b/be/src/exec/exec-node.h
@@ -184,6 +184,10 @@ class ExecNode {
   MemTracker* mem_tracker() { return mem_tracker_.get(); }
   MemTracker* expr_mem_tracker() { return expr_mem_tracker_.get(); }
 
+  /// Return true if codegen was disabled by the planner for this ExecNode. 
Does not
+  /// check to see if codegen was enabled for the enclosing fragment.
+  bool IsNodeCodegenDisabled() const;
+
   /// Add codegen disabled message if codegen is disabled for this ExecNode.
   void AddCodegenDisabledMessage(RuntimeState* state);
 
@@ -276,6 +280,9 @@ class ExecNode {
   /// Valid to call in or after Prepare().
   bool IsInSubplan() const { return containing_subplan_ != NULL; }
 
+  /// If true, codegen should be disabled for this exec node.
+  const bool disable_codegen_;
+
   /// Create a single exec node derived from thrift node; place exec node in 
'pool'.
   static Status CreateNode(ObjectPool* pool, const TPlanNode& tnode,
       const DescriptorTbl& descs, ExecNode** node, RuntimeState* state);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/hash-join-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-join-node.cc b/be/src/exec/hash-join-node.cc
index a18f356..77807f6 100644
--- a/be/src/exec/hash-join-node.cc
+++ b/be/src/exec/hash-join-node.cc
@@ -134,14 +134,13 @@ Status HashJoinNode::Prepare(RuntimeState* state) {
   AddExprCtxsToFree(other_join_conjunct_ctxs_);
 
   // TODO: default buckets
-  const bool stores_nulls = join_op_ == TJoinOp::RIGHT_OUTER_JOIN ||
-      join_op_ == TJoinOp::FULL_OUTER_JOIN ||
-      std::accumulate(is_not_distinct_from_.begin(), 
is_not_distinct_from_.end(), false,
-                      std::logical_or<bool>());
+  const bool stores_nulls = join_op_ == TJoinOp::RIGHT_OUTER_JOIN
+      || join_op_ == TJoinOp::FULL_OUTER_JOIN
+      || std::accumulate(is_not_distinct_from_.begin(), 
is_not_distinct_from_.end(),
+                                false, std::logical_or<bool>());
   hash_tbl_.reset(new OldHashTable(state, build_expr_ctxs_, probe_expr_ctxs_,
-          filter_expr_ctxs_,
-          child(1)->row_desc().tuple_descriptors().size(), stores_nulls,
-          is_not_distinct_from_, state->fragment_hash_seed(), mem_tracker(), 
filters_));
+      filter_expr_ctxs_, child(1)->row_desc().tuple_descriptors().size(), 
stores_nulls,
+      is_not_distinct_from_, state->fragment_hash_seed(), mem_tracker(), 
filters_));
   build_pool_.reset(new MemPool(mem_tracker()));
   AddCodegenDisabledMessage(state);
   return Status::OK();
@@ -149,6 +148,9 @@ Status HashJoinNode::Prepare(RuntimeState* state) {
 
 void HashJoinNode::Codegen(RuntimeState* state) {
   DCHECK(state->ShouldCodegen());
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
   LlvmCodeGen* codegen = state->codegen();
   DCHECK(codegen != NULL);
   bool build_codegen_enabled = false;
@@ -178,7 +180,6 @@ void HashJoinNode::Codegen(RuntimeState* state) {
   }
   runtime_profile()->AddCodegenMsg(build_codegen_enabled, "", "Build Side");
   runtime_profile()->AddCodegenMsg(probe_codegen_enabled, "", "Probe Side");
-  ExecNode::Codegen(state);
 }
 
 Status HashJoinNode::Reset(RuntimeState* state) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/hash-table.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index 8d45ec7..f4ecb67 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -755,13 +755,18 @@ Status HashTableCtx::CodegenEvalRow(LlvmCodeGen* codegen, 
bool build, Function**
     if (!status.ok()) {
       (*fn)->eraseFromParent(); // deletes function
       *fn = NULL;
-      return Status(Substitute("Problem with HashTableCtx::CodegenEvalRow(): 
$0",
-          status.GetDetail()));
+      return Status(Substitute(
+          "Problem with HashTableCtx::CodegenEvalRow(): $0", 
status.GetDetail()));
     }
 
-    Value* get_expr_ctx_args[] = { this_ptr, codegen->GetIntConstant(TYPE_INT, 
i) };
+    // Avoid bloating function by inlining too many exprs into it.
+    if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) {
+      codegen->SetNoInline(expr_fn);
+    }
+
+    Value* get_expr_ctx_args[] = {this_ptr, codegen->GetIntConstant(TYPE_INT, 
i)};
     Value* ctx_arg = builder.CreateCall(get_expr_ctx_fn, get_expr_ctx_args, 
"expr_ctx");
-    Value* expr_fn_args[] = { ctx_arg, row };
+    Value* expr_fn_args[] = {ctx_arg, row};
     CodegenAnyVal result = CodegenAnyVal::CreateCallWrapped(
         codegen, &builder, ctxs[i]->root()->type(), expr_fn, expr_fn_args, 
"result");
     Value* is_null = result.GetIsNull();
@@ -800,10 +805,15 @@ Status HashTableCtx::CodegenEvalRow(LlvmCodeGen* codegen, 
bool build, Function**
   }
   builder.CreateRet(has_null);
 
+  // Avoid inlining a large EvalRow() function into caller.
+  if (ctxs.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) {
+    codegen->SetNoInline(*fn);
+  }
+
   *fn = codegen->FinalizeFunction(*fn);
   if (*fn == NULL) {
     return Status("Codegen'd HashTableCtx::EvalRow() function failed 
verification, "
-        "see log");
+                  "see log");
   }
   return Status::OK();
 }
@@ -972,6 +982,11 @@ Status HashTableCtx::CodegenHashRow(LlvmCodeGen* codegen, 
bool use_murmur, Funct
   }
 
   builder.CreateRet(hash_result);
+
+  // Avoid inlining into caller if there are many exprs.
+  if (build_expr_ctxs_.size() > 
LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) {
+    codegen->SetNoInline(*fn);
+  }
   *fn = codegen->FinalizeFunction(*fn);
   if (*fn == NULL) {
     return Status(
@@ -1095,12 +1110,16 @@ Status HashTableCtx::CodegenEquals(LlvmCodeGen* 
codegen, bool force_null_equalit
     if (!status.ok()) {
       (*fn)->eraseFromParent(); // deletes function
       *fn = NULL;
-      return Status(Substitute("Problem with HashTableCtx::CodegenEquals: $0",
-          status.GetDetail()));
+      return Status(
+          Substitute("Problem with HashTableCtx::CodegenEquals: $0", 
status.GetDetail()));
+    }
+    if (build_expr_ctxs_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) 
{
+      // Avoid bloating function by inlining too many exprs into it.
+      codegen->SetNoInline(expr_fn);
     }
 
     // Load ExprContext* from 'build_expr_ctxs_'.
-    Value* get_expr_ctx_args[] = { this_ptr, codegen->GetIntConstant(TYPE_INT, 
i) };
+    Value* get_expr_ctx_args[] = {this_ptr, codegen->GetIntConstant(TYPE_INT, 
i)};
     Value* ctx_arg = builder.CreateCall(get_expr_ctx_fn, get_expr_ctx_args, 
"expr_ctx");
 
     // Evaluate the expression.
@@ -1156,10 +1175,14 @@ Status HashTableCtx::CodegenEquals(LlvmCodeGen* 
codegen, bool force_null_equalit
   builder.SetInsertPoint(false_block);
   builder.CreateRet(codegen->false_value());
 
+  // Avoid inlining into caller if it is large.
+  if (build_expr_ctxs_.size() > 
LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) {
+    codegen->SetNoInline(*fn);
+  }
   *fn = codegen->FinalizeFunction(*fn);
   if (*fn == NULL) {
     return Status("Codegen'd HashTableCtx::Equals() function failed 
verification, "
-        "see log");
+                  "see log");
   }
   return Status::OK();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/hdfs-scan-node-base.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-base.cc 
b/be/src/exec/hdfs-scan-node-base.cc
index e616faf..5b97e76 100644
--- a/be/src/exec/hdfs-scan-node-base.cc
+++ b/be/src/exec/hdfs-scan-node-base.cc
@@ -297,6 +297,10 @@ Status HdfsScanNodeBase::Prepare(RuntimeState* state) {
 }
 
 void HdfsScanNodeBase::Codegen(RuntimeState* state) {
+  DCHECK(state->ShouldCodegen());
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
   // Create codegen'd functions
   for (int format = THdfsFileFormat::TEXT; format <= THdfsFileFormat::PARQUET; 
++format) {
     vector<HdfsFileDesc*>& file_descs =
@@ -343,7 +347,6 @@ void HdfsScanNodeBase::Codegen(RuntimeState* state) {
     }
     runtime_profile()->AddCodegenMsg(status.ok(), status, format_name);
   }
-  ExecNode::Codegen(state);
 }
 
 Status HdfsScanNodeBase::Open(RuntimeState* state) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/hdfs-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index 390085f..c6a5f37 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -332,6 +332,7 @@ Status 
HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node,
         node->hdfs_table()->null_column_value().data(),
         node->hdfs_table()->null_column_value().size(), true, 
state->strict_mode());
     if (fn == NULL) return Status("CodegenWriteSlot failed.");
+    if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) 
codegen->SetNoInline(fn);
     slot_fns.push_back(fn);
   }
 
@@ -487,6 +488,10 @@ Status 
HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node,
         fn->eraseFromParent();
         return status;
       }
+      if (node->materialized_slots().size() + conjunct_idx
+          >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) {
+        codegen->SetNoInline(conjunct_fn);
+      }
 
       Function* get_ctx_fn =
           codegen->GetFunction(IRFunction::HDFS_SCANNER_GET_CONJUNCT_CTX, 
false);
@@ -505,6 +510,10 @@ Status 
HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node,
   builder.SetInsertPoint(eval_fail_block);
   builder.CreateRet(codegen->false_value());
 
+  if (node->materialized_slots().size() + conjunct_ctxs.size()
+      > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) {
+    codegen->SetNoInline(fn);
+  }
   *write_complete_tuple_fn = codegen->FinalizeFunction(fn);
   if (*write_complete_tuple_fn == NULL) {
     return Status("Failed to finalize write_complete_tuple_fn.");

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/partitioned-aggregation-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.cc 
b/be/src/exec/partitioned-aggregation-node.cc
index 01622b9..6cc36be 100644
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -281,14 +281,16 @@ Status PartitionedAggregationNode::Prepare(RuntimeState* 
state) {
 
 void PartitionedAggregationNode::Codegen(RuntimeState* state) {
   DCHECK(state->ShouldCodegen());
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
   LlvmCodeGen* codegen = state->codegen();
   DCHECK(codegen != NULL);
   TPrefetchMode::type prefetch_mode = state_->query_options().prefetch_mode;
-  Status codegen_status =
-     is_streaming_preagg_ ? CodegenProcessBatchStreaming(codegen, 
prefetch_mode) :
-          CodegenProcessBatch(codegen, prefetch_mode);
+  Status codegen_status = is_streaming_preagg_ ?
+      CodegenProcessBatchStreaming(codegen, prefetch_mode) :
+      CodegenProcessBatch(codegen, prefetch_mode);
   runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status);
-  ExecNode::Codegen(state);
 }
 
 Status PartitionedAggregationNode::Open(RuntimeState* state) {
@@ -1553,7 +1555,8 @@ Status 
PartitionedAggregationNode::QueryMaintenance(RuntimeState* state) {
 // }
 //
 Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
-    AggFnEvaluator* evaluator, SlotDescriptor* slot_desc, Function** fn) {
+    AggFnEvaluator* evaluator, int evaluator_idx, SlotDescriptor* slot_desc,
+    Function** fn) {
   PointerType* fn_ctx_type =
       codegen->GetPtrType(FunctionContextImpl::LLVM_FUNCTIONCONTEXT_NAME);
   PointerType* expr_ctxs_type =
@@ -1697,10 +1700,18 @@ Status 
PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
   builder.SetInsertPoint(ret_block);
   builder.CreateRetVoid();
 
+  // Avoid producing huge UpdateTuple() function after inlining - LLVM's 
optimiser
+  // memory/CPU usage scales super-linearly with function size.
+  // E.g. compute stats on all columns of a 1000-column table previously took 
4 minutes to
+  // codegen because all the UpdateSlot() functions were inlined.
+  if (evaluator_idx >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) {
+    codegen->SetNoInline(*fn);
+  }
+
   *fn = codegen->FinalizeFunction(*fn);
   if (*fn == NULL) {
     return Status("PartitionedAggregationNode::CodegenUpdateSlot(): codegen'd "
-        "UpdateSlot() function failed verification, see log");
+                  "UpdateSlot() function failed verification, see log");
   }
   return Status::OK();
 }
@@ -1874,7 +1885,8 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(
       builder.CreateStore(count_inc, slot_ptr);
     } else {
       Function* update_slot_fn;
-      RETURN_IF_ERROR(CodegenUpdateSlot(codegen, evaluator, slot_desc, 
&update_slot_fn));
+      RETURN_IF_ERROR(
+          CodegenUpdateSlot(codegen, evaluator, i, slot_desc, 
&update_slot_fn));
       Value* agg_fn_ctx_ptr = builder.CreateConstGEP1_32(agg_fn_ctxs_arg, i);
       Value* agg_fn_ctx = builder.CreateLoad(agg_fn_ctx_ptr, "agg_fn_ctx");
       // Call GetExprCtx() to get the expression context.
@@ -1887,6 +1899,12 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(
   }
   builder.CreateRetVoid();
 
+  // Avoid inlining big UpdateTuple function into outer loop - we're unlikely 
to get
+  // any benefit from it since the function call overhead will be amortized.
+  if (aggregate_evaluators_.size() > 
LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) {
+    codegen->SetNoInline(*fn);
+  }
+
   // CodegenProcessBatch() does the final optimizations.
   *fn = codegen->FinalizeFunction(*fn);
   if (*fn == NULL) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/partitioned-aggregation-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.h 
b/be/src/exec/partitioned-aggregation-node.h
index 4287cea..54840c7 100644
--- a/be/src/exec/partitioned-aggregation-node.h
+++ b/be/src/exec/partitioned-aggregation-node.h
@@ -643,7 +643,7 @@ class PartitionedAggregationNode : public ExecNode {
   /// Codegen UpdateSlot(). Returns non-OK status if codegen is unsuccessful.
   /// Assumes is_merge = false;
   Status CodegenUpdateSlot(LlvmCodeGen* codegen, AggFnEvaluator* evaluator,
-      SlotDescriptor* slot_desc, llvm::Function** fn);
+      int evaluator_idx, SlotDescriptor* slot_desc, llvm::Function** fn);
 
   /// Codegen a call to a function implementing the UDA interface with input 
values
   /// from 'input_vals'. 'dst_val' should contain the previous value of the 
aggregate

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/partitioned-hash-join-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.cc 
b/be/src/exec/partitioned-hash-join-node.cc
index 61b7eb7..6073486 100644
--- a/be/src/exec/partitioned-hash-join-node.cc
+++ b/be/src/exec/partitioned-hash-join-node.cc
@@ -141,6 +141,10 @@ Status PartitionedHashJoinNode::Prepare(RuntimeState* 
state) {
 
 void PartitionedHashJoinNode::Codegen(RuntimeState* state) {
   DCHECK(state->ShouldCodegen());
+  // Codegen the children node;
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
   LlvmCodeGen* codegen = state->codegen();
   DCHECK(codegen != NULL);
 
@@ -150,11 +154,8 @@ void PartitionedHashJoinNode::Codegen(RuntimeState* state) 
{
   // Codegen the probe side.
   TPrefetchMode::type prefetch_mode = state->query_options().prefetch_mode;
   Status probe_codegen_status = CodegenProcessProbeBatch(codegen, 
prefetch_mode);
-  runtime_profile()->AddCodegenMsg(probe_codegen_status.ok(), 
probe_codegen_status,
-      "Probe Side");
-
-  // Codegen the children node;
-  ExecNode::Codegen(state);
+  runtime_profile()->AddCodegenMsg(
+      probe_codegen_status.ok(), probe_codegen_status, "Probe Side");
 }
 
 Status PartitionedHashJoinNode::Open(RuntimeState* state) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/sort-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/sort-node.cc b/be/src/exec/sort-node.cc
index 7518223..1a8de83 100644
--- a/be/src/exec/sort-node.cc
+++ b/be/src/exec/sort-node.cc
@@ -51,8 +51,9 @@ Status SortNode::Prepare(RuntimeState* state) {
       state, child(0)->row_desc(), row_descriptor_, expr_mem_tracker()));
   AddExprCtxsToFree(sort_exec_exprs_);
   less_than_.reset(new TupleRowComparator(sort_exec_exprs_, is_asc_order_, 
nulls_first_));
-  sorter_.reset(new Sorter(*less_than_.get(), 
sort_exec_exprs_.sort_tuple_slot_expr_ctxs(),
-      &row_descriptor_, mem_tracker(), runtime_profile(), state));
+  sorter_.reset(
+      new Sorter(*less_than_.get(), 
sort_exec_exprs_.sort_tuple_slot_expr_ctxs(),
+          &row_descriptor_, mem_tracker(), runtime_profile(), state));
   RETURN_IF_ERROR(sorter_->Init());
   AddCodegenDisabledMessage(state);
   return Status::OK();
@@ -60,9 +61,11 @@ Status SortNode::Prepare(RuntimeState* state) {
 
 void SortNode::Codegen(RuntimeState* state) {
   DCHECK(state->ShouldCodegen());
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
   Status codegen_status = less_than_->Codegen(state);
   runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status);
-  ExecNode::Codegen(state);
 }
 
 Status SortNode::Open(RuntimeState* state) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/topn-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/topn-node.cc b/be/src/exec/topn-node.cc
index bd4caec..a7f11bd 100644
--- a/be/src/exec/topn-node.cc
+++ b/be/src/exec/topn-node.cc
@@ -84,6 +84,9 @@ Status TopNNode::Prepare(RuntimeState* state) {
 
 void TopNNode::Codegen(RuntimeState* state) {
   DCHECK(state->ShouldCodegen());
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
   LlvmCodeGen* codegen = state->codegen();
   DCHECK(codegen != NULL);
 
@@ -126,7 +129,6 @@ void TopNNode::Codegen(RuntimeState* state) {
     }
   }
   runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status);
-  ExecNode::Codegen(state);
 }
 
 Status TopNNode::Open(RuntimeState* state) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exprs/aggregate-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/aggregate-functions-ir.cc 
b/be/src/exprs/aggregate-functions-ir.cc
index 7ed6386..f6a91f4 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -1190,8 +1190,26 @@ void AggregateFunctions::HllUpdate(FunctionContext* ctx, 
const T& src, StringVal
   }
 }
 
-void AggregateFunctions::HllMerge(FunctionContext* ctx, const StringVal& src,
-    StringVal* dst) {
+// Specialize for DecimalVal to allow substituting decimal size.
+template <>
+void AggregateFunctions::HllUpdate(
+    FunctionContext* ctx, const DecimalVal& src, StringVal* dst) {
+  if (src.is_null) return;
+  DCHECK(!dst->is_null);
+  DCHECK_EQ(dst->len, HLL_LEN);
+  uint64_t hash_value = AnyValUtil::HashDecimal64(
+      src, Expr::GetConstantInt(*ctx, Expr::ARG_TYPE_SIZE, 0), 
HashUtil::FNV64_SEED);
+  if (hash_value != 0) {
+    // Use the lower bits to index into the number of streams and then
+    // find the first 1 bit after the index bits.
+    int idx = hash_value & (HLL_LEN - 1);
+    uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_PRECISION) + 1;
+    dst->ptr[idx] = ::max(dst->ptr[idx], first_one_bit);
+  }
+}
+
+void AggregateFunctions::HllMerge(
+    FunctionContext* ctx, const StringVal& src, StringVal* dst) {
   DCHECK(!dst->is_null);
   DCHECK(!src.is_null);
   DCHECK_EQ(dst->len, HLL_LEN);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exprs/anyval-util.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/anyval-util.h b/be/src/exprs/anyval-util.h
index 8c75247..4703266 100644
--- a/be/src/exprs/anyval-util.h
+++ b/be/src/exprs/anyval-util.h
@@ -133,12 +133,19 @@ class AnyValUtil {
     return HashUtil::MurmurHash2_64(&tv, 12, seed);
   }
 
-  static uint64_t Hash64(const DecimalVal& v, const FunctionContext::TypeDesc& 
t,
-      int64_t seed) {
-    switch (ColumnType::GetDecimalByteSize(t.precision)) {
-      case 4: return HashUtil::MurmurHash2_64(&v.val4, 4, seed);
-      case 8: return HashUtil::MurmurHash2_64(&v.val8, 8, seed);
-      case 16: return HashUtil::MurmurHash2_64(&v.val16, 16, seed);
+  static uint64_t Hash64(
+      const DecimalVal& v, const FunctionContext::TypeDesc& t, int64_t seed) {
+    return HashDecimal64(v, ColumnType::GetDecimalByteSize(t.precision), seed);
+  }
+
+  static uint64_t HashDecimal64(const DecimalVal& v, int byte_size, int64_t 
seed) {
+    switch (byte_size) {
+      case 4:
+        return HashUtil::MurmurHash2_64(&v.val4, 4, seed);
+      case 8:
+        return HashUtil::MurmurHash2_64(&v.val8, 8, seed);
+      case 16:
+        return HashUtil::MurmurHash2_64(&v.val16, 16, seed);
       default:
         DCHECK(false);
         return 0;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exprs/expr-codegen-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-codegen-test.cc 
b/be/src/exprs/expr-codegen-test.cc
index 6ccbbf4..cbb5617 100644
--- a/be/src/exprs/expr-codegen-test.cc
+++ b/be/src/exprs/expr-codegen-test.cc
@@ -251,7 +251,8 @@ TEST_F(ExprCodegenTest, TestInlineConstants) {
   stringstream test_udf_file;
   test_udf_file << getenv("IMPALA_HOME") << 
"/be/build/latest/exprs/expr-codegen-test.ll";
   scoped_ptr<LlvmCodeGen> codegen;
-  ASSERT_OK(LlvmCodeGen::CreateFromFile(&pool, test_udf_file.str(), "test", 
&codegen));
+  ASSERT_OK(
+      LlvmCodeGen::CreateFromFile(&pool, NULL, test_udf_file.str(), "test", 
&codegen));
   Function* fn = codegen->GetFunction(TEST_GET_CONSTANT_SYMBOL, false);
   ASSERT_TRUE(fn != NULL);
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exprs/expr.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h
index 552fee9..942eec0 100644
--- a/be/src/exprs/expr.h
+++ b/be/src/exprs/expr.h
@@ -361,6 +361,7 @@ class Expr {
 
   /// recognize if this node is a slotref in order to speed up GetValue()
   const bool is_slotref_;
+
   /// analysis is done, types are fixed at this point
   const ColumnType type_;
   std::vector<Expr*> children_;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/runtime/lib-cache.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/lib-cache.cc b/be/src/runtime/lib-cache.cc
index fe8d180..00c1159 100644
--- a/be/src/runtime/lib-cache.cc
+++ b/be/src/runtime/lib-cache.cc
@@ -403,8 +403,8 @@ Status LibCache::GetCacheEntryInternal(const string& 
hdfs_lib_file, LibType type
     ObjectPool pool;
     scoped_ptr<LlvmCodeGen> codegen;
     string module_id = filesystem::path((*entry)->local_path).stem().string();
-    RETURN_IF_ERROR(
-        LlvmCodeGen::CreateFromFile(&pool, (*entry)->local_path, module_id, 
&codegen));
+    RETURN_IF_ERROR(LlvmCodeGen::CreateFromFile(
+        &pool, NULL, (*entry)->local_path, module_id, &codegen));
     codegen->GetSymbols(&(*entry)->symbols);
   } else {
     DCHECK_EQ(type, TYPE_JAR);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/runtime/runtime-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-state.cc b/be/src/runtime/runtime-state.cc
index 40a4946..37d5fb0 100644
--- a/be/src/runtime/runtime-state.cc
+++ b/be/src/runtime/runtime-state.cc
@@ -96,16 +96,16 @@ RuntimeState::RuntimeState(const TQueryCtx& query_ctx)
 RuntimeState::~RuntimeState() {
   block_mgr_.reset();
 
+  // Release codegen memory before tearing down trackers.
+  codegen_.reset();
+
   // query_mem_tracker_ must be valid as long as instance_mem_tracker_ is so
   // delete instance_mem_tracker_ first.
   // LogUsage() walks the MemTracker tree top-down when the memory limit is 
exceeded.
   // Break the link between the instance_mem_tracker and its parent 
(query_mem_tracker_)
   // before the instance_mem_tracker_ and its children are destroyed.
-  if (instance_mem_tracker_.get() != NULL) {
-    // May be NULL if InitMemTrackers() is not called, for example from tests.
-    instance_mem_tracker_->UnregisterFromParent();
-  }
-
+  // May be NULL if InitMemTrackers() is not called, for example from tests.
+  if (instance_mem_tracker_ != NULL) 
instance_mem_tracker_->UnregisterFromParent();
   instance_mem_tracker_.reset();
   query_mem_tracker_.reset();
 }
@@ -184,8 +184,8 @@ Status RuntimeState::CreateBlockMgr() {
 Status RuntimeState::CreateCodegen() {
   if (codegen_.get() != NULL) return Status::OK();
   // TODO: add the fragment ID to the codegen ID as well
-  RETURN_IF_ERROR(LlvmCodeGen::CreateImpalaCodegen(
-      obj_pool_.get(), PrintId(fragment_instance_id()), &codegen_));
+  RETURN_IF_ERROR(LlvmCodeGen::CreateImpalaCodegen(obj_pool_.get(),
+      instance_mem_tracker_.get(), PrintId(fragment_instance_id()), 
&codegen_));
   codegen_->EnableOptimizations(true);
   profile_.AddChild(codegen_->runtime_profile());
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/common/thrift/PlanNodes.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift
index 6f863b2..233fbb2 100644
--- a/common/thrift/PlanNodes.thrift
+++ b/common/thrift/PlanNodes.thrift
@@ -463,32 +463,36 @@ struct TPlanNode {
   6: required list<bool> nullable_tuples
   7: optional list<Exprs.TExpr> conjuncts
 
+  // Set to true if codegen should be disabled for this plan node. Otherwise 
the plan
+  // node is codegen'd if the backend supports it.
+  8: required bool disable_codegen
+
   // one field per PlanNode subclass
-  8: optional THdfsScanNode hdfs_scan_node
-  9: optional THBaseScanNode hbase_scan_node
-  23: optional TKuduScanNode kudu_scan_node
-  10: optional TDataSourceScanNode data_source_node
-  11: optional THashJoinNode hash_join_node
-  12: optional TNestedLoopJoinNode nested_loop_join_node
-  13: optional TAggregationNode agg_node
-  14: optional TSortNode sort_node
-  15: optional TUnionNode union_node
-  16: optional TExchangeNode exchange_node
-  17: optional TAnalyticNode analytic_node
-  21: optional TUnnestNode unnest_node
+  9: optional THdfsScanNode hdfs_scan_node
+  10: optional THBaseScanNode hbase_scan_node
+  11: optional TKuduScanNode kudu_scan_node
+  12: optional TDataSourceScanNode data_source_node
+  13: optional THashJoinNode hash_join_node
+  14: optional TNestedLoopJoinNode nested_loop_join_node
+  15: optional TAggregationNode agg_node
+  16: optional TSortNode sort_node
+  17: optional TUnionNode union_node
+  18: optional TExchangeNode exchange_node
+  19: optional TAnalyticNode analytic_node
+  20: optional TUnnestNode unnest_node
 
   // Label that should be used to print this node to the user.
-  18: optional string label
+  21: optional string label
 
   // Additional details that should be printed to the user. This is node 
specific
   // e.g. table name, join strategy, etc.
-  19: optional string label_detail
+  22: optional string label_detail
 
   // Estimated execution stats generated by the planner.
-  20: optional ExecStats.TExecStats estimated_stats
+  23: optional ExecStats.TExecStats estimated_stats
 
   // Runtime filters assigned to this plan node
-  22: optional list<TRuntimeFilterDesc> runtime_filters
+  24: optional list<TRuntimeFilterDesc> runtime_filters
 }
 
 // A flattened representation of a tree of PlanNodes, obtained by depth-first

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java 
b/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java
index 620ea56..24d7caa 100644
--- a/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java
+++ b/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java
@@ -818,6 +818,12 @@ public class DistributedPlanner {
         mergeFragment.getPlanRoot(), node.getAggInfo().getMergeAggInfo());
     mergeAggNode.init(ctx_.getRootAnalyzer());
     mergeAggNode.setLimit(limit);
+    // Merge of non-grouping agg only processes one tuple per Impala daemon - 
codegen
+    // will cost more than benefit.
+    if (!hasGrouping) {
+      mergeFragment.getPlanRoot().setDisableCodegen(true);
+      mergeAggNode.setDisableCodegen(true);
+    }
 
     // HAVING predicates can only be evaluated after the merge agg step
     node.transferConjuncts(mergeAggNode);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/fe/src/main/java/org/apache/impala/planner/PlanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/PlanNode.java 
b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
index 141464e..312ea5b 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
@@ -120,6 +120,9 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
   // set in computeCosts(); invalid: -1
   protected long perHostMemCost_ = -1;
 
+  // If true, disable codegen for this plan node.
+  protected boolean disableCodegen_;
+
   // Runtime filters assigned to this node.
   protected List<RuntimeFilter> runtimeFilters_ = Lists.newArrayList();
 
@@ -144,6 +147,7 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
     cardinality_ = -1;
     numNodes_ = -1;
     displayName_ = displayName;
+    disableCodegen_ = false;
   }
 
   /**
@@ -159,6 +163,7 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
     cardinality_ = -1;
     numNodes_ = -1;
     displayName_ = displayName;
+    disableCodegen_ = node.disableCodegen_;
   }
 
   /**
@@ -393,9 +398,10 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
       msg.addToConjuncts(e.treeToThrift());
     }
     // Serialize any runtime filters
-    for (RuntimeFilter filter: runtimeFilters_) {
+    for (RuntimeFilter filter : runtimeFilters_) {
       msg.addToRuntime_filters(filter.toThrift());
     }
+    msg.setDisable_codegen(disableCodegen_);
     toThrift(msg);
     container.addToNodes(msg);
     // For the purpose of the BE consider ExchangeNodes to have no children.
@@ -718,4 +724,8 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
 
     return sortedConjuncts;
   }
+
+  public void setDisableCodegen(boolean disableCodegen) {
+    disableCodegen_ = disableCodegen;
+  }
 }

[6/6] incubator-impala git commit: IMPALA-4397, IMPALA-3259: reduce codegen time and memory

Reply via email to