IMPALA-4397,IMPALA-3259: reduce codegen time and memory A handful of fixes to codegen memory usage: * Delete the IR module when we're done with it (it can be fairly large) * Track the compiled code size (typically not that large, but it can add up if there are many fragments). * Estimate optimisation memory requirements and track it in the memory tracker. This is very crude but much better than not tracking it.
A handful of fixes to improve codegen time/cost, particularly targeted at compute stats workloads: * Avoid over-inlining when there are many aggregate functions, conjuncts, etc by adding "NoInline" attributes. * Don't codegen non-grouping merge aggregations. They will only process one row per Impala daemon, so codegen is not worth it. * Make the Hll algorithm more efficient by specialising the hash function based on decimal width. Limitations: * This doesn't tackle over-inlining of large expr trees, but a similar approach will be used there in a follow-on patch. Perf: Compute stats on functional_parquet.widetable_1000_cols goes from 1min+ of codegen to ~ 5s codegen on my machine. Local perf runs of tpc-h and targeted perf showed no regressions and some moderate improvements (1-2%). Also did an experiment to understand the perf consequences of disabling inlining. I manually set CODEGEN_INLINE_EXPRS_THRESHOLD to 0, and ran: drop stats tpch_20_parquet.lineitem compute stats tpch_20_parquet.lineitem; There was no difference in time spent in the agg node: 30.7s with inlining, 30.5s without. Change-Id: Id10015b49da182cb181a653ac8464b4a18b71091 Reviewed-on: http://gerrit.cloudera.org:8080/4956 Reviewed-by: Tim Armstrong <[email protected]> Reviewed-by: Marcel Kornacker <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/4db330e6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/4db330e6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/4db330e6 Branch: refs/heads/master Commit: 4db330e69a2dbb4a23f46e34b484da0d6b9ef29b Parents: 696fb68 Author: Tim Armstrong <[email protected]> Authored: Tue Oct 4 17:09:17 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Wed Nov 23 08:18:17 2016 +0000 ---------------------------------------------------------------------- be/src/benchmarks/hash-benchmark.cc | 2 +- be/src/codegen/llvm-codegen-test.cc | 40 +- be/src/codegen/llvm-codegen.cc | 114 +- be/src/codegen/llvm-codegen.h | 84 +- be/src/codegen/mcjit-mem-mgr.h | 38 +- be/src/exec/aggregation-node.cc | 4 +- be/src/exec/exchange-node.cc | 4 +- be/src/exec/exec-node.cc | 17 +- be/src/exec/exec-node.h | 7 + be/src/exec/hash-join-node.cc | 17 +- be/src/exec/hash-table.cc | 41 +- be/src/exec/hdfs-scan-node-base.cc | 5 +- be/src/exec/hdfs-scanner.cc | 9 + be/src/exec/partitioned-aggregation-node.cc | 32 +- be/src/exec/partitioned-aggregation-node.h | 2 +- be/src/exec/partitioned-hash-join-node.cc | 11 +- be/src/exec/sort-node.cc | 9 +- be/src/exec/topn-node.cc | 4 +- be/src/exprs/aggregate-functions-ir.cc | 22 +- be/src/exprs/anyval-util.h | 19 +- be/src/exprs/expr-codegen-test.cc | 3 +- be/src/exprs/expr.h | 1 + be/src/runtime/lib-cache.cc | 4 +- be/src/runtime/runtime-state.cc | 14 +- common/thrift/PlanNodes.thrift | 36 +- .../impala/planner/DistributedPlanner.java | 6 + .../org/apache/impala/planner/PlanNode.java | 12 +- .../queries/QueryTest/compute-stats.test | 1016 ++++++++++++++++++ tests/query_test/test_aggregation.py | 51 +- 29 files changed, 1479 insertions(+), 145 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/benchmarks/hash-benchmark.cc ---------------------------------------------------------------------- diff --git a/be/src/benchmarks/hash-benchmark.cc b/be/src/benchmarks/hash-benchmark.cc index 4f6d7f1..125310b 100644 --- a/be/src/benchmarks/hash-benchmark.cc +++ b/be/src/benchmarks/hash-benchmark.cc @@ -429,7 +429,7 @@ int main(int argc, char **argv) { Status status; scoped_ptr<LlvmCodeGen> codegen; - status = LlvmCodeGen::CreateImpalaCodegen(&obj_pool, "test", &codegen); + status = LlvmCodeGen::CreateImpalaCodegen(&obj_pool, NULL, "test", &codegen); if (!status.ok()) { cout << "Could not start codegen."; return -1; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/codegen/llvm-codegen-test.cc ---------------------------------------------------------------------- diff --git a/be/src/codegen/llvm-codegen-test.cc b/be/src/codegen/llvm-codegen-test.cc index 178baa2..1238dde 100644 --- a/be/src/codegen/llvm-codegen-test.cc +++ b/be/src/codegen/llvm-codegen-test.cc @@ -42,9 +42,9 @@ class LlvmCodeGenTest : public testing:: Test { ObjectPool pool; Status status; for (int i = 0; i < 10; ++i) { - LlvmCodeGen object1(&pool, "Test"); - LlvmCodeGen object2(&pool, "Test"); - LlvmCodeGen object3(&pool, "Test"); + LlvmCodeGen object1(&pool, NULL, "Test"); + LlvmCodeGen object2(&pool, NULL, "Test"); + LlvmCodeGen object3(&pool, NULL, "Test"); ASSERT_OK(object1.Init(unique_ptr<Module>(new Module("Test", object1.context())))); ASSERT_OK(object2.Init(unique_ptr<Module>(new Module("Test", object2.context())))); @@ -53,16 +53,16 @@ class LlvmCodeGenTest : public testing:: Test { } // Wrapper to call private test-only methods on LlvmCodeGen object - static Status CreateFromFile(ObjectPool* pool, const string& filename, - scoped_ptr<LlvmCodeGen>* codegen) { - return LlvmCodeGen::CreateFromFile(pool, filename, "test", codegen); + static Status CreateFromFile( + ObjectPool* pool, const string& filename, scoped_ptr<LlvmCodeGen>* codegen) { + return LlvmCodeGen::CreateFromFile(pool, NULL, filename, "test", codegen); } static LlvmCodeGen* CreateCodegen(ObjectPool* pool) { - LlvmCodeGen* codegen = pool->Add(new LlvmCodeGen(pool, "Test")); + LlvmCodeGen* codegen = pool->Add(new LlvmCodeGen(pool, NULL, "Test")); if (codegen != NULL) { - Status status = codegen->Init( - unique_ptr<Module>(new Module("Test", codegen->context()))); + Status status = + codegen->Init(unique_ptr<Module>(new Module("Test", codegen->context()))); if (!status.ok()) return NULL; } return codegen; @@ -82,10 +82,7 @@ class LlvmCodeGenTest : public testing:: Test { return codegen->VerifyFunction(fn); } - static Status FinalizeModule(LlvmCodeGen* codegen) { - return codegen->FinalizeModule(); - } - + static Status FinalizeModule(LlvmCodeGen* codegen) { return codegen->FinalizeModule(); } }; // Simple test to just make and destroy llvmcodegen objects. LLVM @@ -109,7 +106,8 @@ TEST_F(LlvmCodeGenTest, BadIRFile) { ObjectPool pool; string module_file = "NonExistentFile.ir"; scoped_ptr<LlvmCodeGen> codegen; - EXPECT_FALSE(LlvmCodeGenTest::CreateFromFile(&pool, module_file.c_str(), &codegen).ok()); + EXPECT_FALSE( + LlvmCodeGenTest::CreateFromFile(&pool, module_file.c_str(), &codegen).ok()); } // IR for the generated linner loop @@ -289,7 +287,7 @@ TEST_F(LlvmCodeGenTest, StringValue) { ObjectPool pool; scoped_ptr<LlvmCodeGen> codegen; - ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen)); + ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, NULL, "test", &codegen)); EXPECT_TRUE(codegen.get() != NULL); string str("Test"); @@ -332,7 +330,7 @@ TEST_F(LlvmCodeGenTest, MemcpyTest) { ObjectPool pool; scoped_ptr<LlvmCodeGen> codegen; - ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen)); + ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, NULL, "test", &codegen)); ASSERT_TRUE(codegen.get() != NULL); LlvmCodeGen::FnPrototype prototype(codegen.get(), "MemcpyTest", codegen->void_type()); @@ -379,13 +377,13 @@ TEST_F(LlvmCodeGenTest, HashTest) { // Loop to test both the sse4 on/off paths for (int i = 0; i < 2; ++i) { scoped_ptr<LlvmCodeGen> codegen; - ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen)); + ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, NULL, "test", &codegen)); ASSERT_TRUE(codegen.get() != NULL); - Value* llvm_data1 = codegen->CastPtrToLlvmPtr(codegen->ptr_type(), - const_cast<char*>(data1)); - Value* llvm_data2 = codegen->CastPtrToLlvmPtr(codegen->ptr_type(), - const_cast<char*>(data2)); + Value* llvm_data1 = + codegen->CastPtrToLlvmPtr(codegen->ptr_type(), const_cast<char*>(data1)); + Value* llvm_data2 = + codegen->CastPtrToLlvmPtr(codegen->ptr_type(), const_cast<char*>(data2)); Value* llvm_len1 = codegen->GetIntConstant(TYPE_INT, strlen(data1)); Value* llvm_len2 = codegen->GetIntConstant(TYPE_INT, strlen(data2)); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/codegen/llvm-codegen.cc ---------------------------------------------------------------------- diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc index f6e7e97..596f48a 100644 --- a/be/src/codegen/llvm-codegen.cc +++ b/be/src/codegen/llvm-codegen.cc @@ -25,9 +25,9 @@ #include <gutil/strings/substitute.h> #include <llvm/ADT/Triple.h> -#include <llvm/Analysis/TargetTransformInfo.h> #include <llvm/Analysis/InstructionSimplify.h> #include <llvm/Analysis/Passes.h> +#include <llvm/Analysis/TargetTransformInfo.h> #include <llvm/Bitcode/ReaderWriter.h> #include <llvm/ExecutionEngine/ExecutionEngine.h> #include <llvm/ExecutionEngine/MCJIT.h> @@ -52,17 +52,18 @@ #include <llvm/Transforms/Utils/BasicBlockUtils.h> #include <llvm/Transforms/Utils/Cloning.h> -#include "common/logging.h" #include "codegen/codegen-anyval.h" #include "codegen/codegen-symbol-emitter.h" #include "codegen/impala-ir-data.h" #include "codegen/instruction-counter.h" #include "codegen/mcjit-mem-mgr.h" +#include "common/logging.h" #include "impala-ir/impala-ir-names.h" #include "runtime/descriptors.h" #include "runtime/hdfs-fs-cache.h" #include "runtime/lib-cache.h" #include "runtime/mem-pool.h" +#include "runtime/mem-tracker.h" #include "runtime/string-value.h" #include "runtime/timestamp-value.h" #include "util/cpu-info.h" @@ -76,7 +77,7 @@ using namespace llvm; using namespace strings; using std::fstream; -using std::unique_ptr; +using std::move; DEFINE_bool(print_llvm_ir_instruction_count, false, "if true, prints the instruction counts of all JIT'd functions"); @@ -200,7 +201,7 @@ void LlvmCodeGen::InitializeLlvm(bool load_backend) { ObjectPool init_pool; scoped_ptr<LlvmCodeGen> init_codegen; - Status status = LlvmCodeGen::CreateFromMemory(&init_pool, "init", &init_codegen); + Status status = LlvmCodeGen::CreateFromMemory(&init_pool, NULL, "init", &init_codegen); ParseGVForFunctions(init_codegen->module_, &gv_ref_ir_fns_); // Validate the module by verifying that functions for all IRFunction::Type @@ -213,16 +214,18 @@ void LlvmCodeGen::InitializeLlvm(bool load_backend) { } } -LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& id) : - id_(id), - profile_(pool, "CodeGen"), - optimizations_enabled_(false), - is_corrupt_(false), - is_compiled_(false), - context_(new llvm::LLVMContext()), - module_(NULL), - loaded_functions_(IRFunction::FN_END, NULL) { - +LlvmCodeGen::LlvmCodeGen( + ObjectPool* pool, MemTracker* parent_mem_tracker, const string& id) + : id_(id), + profile_(pool, "CodeGen"), + mem_tracker_(new MemTracker(&profile_, -1, "CodeGen", parent_mem_tracker)), + optimizations_enabled_(false), + is_corrupt_(false), + is_compiled_(false), + context_(new llvm::LLVMContext()), + module_(NULL), + memory_manager_(NULL), + loaded_functions_(IRFunction::FN_END, NULL) { DCHECK(llvm_initialized_) << "Must call LlvmCodeGen::InitializeLlvm first."; load_module_timer_ = ADD_TIMER(&profile_, "LoadTime"); @@ -235,9 +238,9 @@ LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& id) : num_instructions_ = ADD_COUNTER(&profile_, "NumInstructions", TUnit::UNIT); } -Status LlvmCodeGen::CreateFromFile(ObjectPool* pool, +Status LlvmCodeGen::CreateFromFile(ObjectPool* pool, MemTracker* parent_mem_tracker, const string& file, const string& id, scoped_ptr<LlvmCodeGen>* codegen) { - codegen->reset(new LlvmCodeGen(pool, id)); + codegen->reset(new LlvmCodeGen(pool, parent_mem_tracker, id)); SCOPED_TIMER((*codegen)->profile_.total_time_counter()); unique_ptr<Module> loaded_module; @@ -246,9 +249,9 @@ Status LlvmCodeGen::CreateFromFile(ObjectPool* pool, return (*codegen)->Init(std::move(loaded_module)); } -Status LlvmCodeGen::CreateFromMemory(ObjectPool* pool, const string& id, - scoped_ptr<LlvmCodeGen>* codegen) { - codegen->reset(new LlvmCodeGen(pool, id)); +Status LlvmCodeGen::CreateFromMemory(ObjectPool* pool, MemTracker* parent_mem_tracker, + const string& id, scoped_ptr<LlvmCodeGen>* codegen) { + codegen->reset(new LlvmCodeGen(pool, parent_mem_tracker, id)); SCOPED_TIMER((*codegen)->profile_.total_time_counter()); // Select the appropriate IR version. We cannot use LLVM IR with SSE4.2 instructions on @@ -375,9 +378,9 @@ void LlvmCodeGen::StripGlobalCtorsDtors(llvm::Module* module) { if (destructors != NULL) destructors->eraseFromParent(); } -Status LlvmCodeGen::CreateImpalaCodegen( - ObjectPool* pool, const string& id, scoped_ptr<LlvmCodeGen>* codegen_ret) { - RETURN_IF_ERROR(CreateFromMemory(pool, id, codegen_ret)); +Status LlvmCodeGen::CreateImpalaCodegen(ObjectPool* pool, MemTracker* parent_mem_tracker, + const string& id, scoped_ptr<LlvmCodeGen>* codegen_ret) { + RETURN_IF_ERROR(CreateFromMemory(pool, parent_mem_tracker, id, codegen_ret)); LlvmCodeGen* codegen = codegen_ret->get(); // Parse module for cross compiled functions and types @@ -422,8 +425,9 @@ Status LlvmCodeGen::Init(unique_ptr<Module> module) { EngineBuilder builder(std::move(module)); builder.setEngineKind(EngineKind::JIT); builder.setOptLevel(opt_level); - builder.setMCJITMemoryManager( - unique_ptr<ImpalaMCJITMemoryManager>(new ImpalaMCJITMemoryManager())); + unique_ptr<ImpalaMCJITMemoryManager> memory_manager(new ImpalaMCJITMemoryManager); + memory_manager_ = memory_manager.get(); + builder.setMCJITMemoryManager(move(memory_manager)); builder.setMCPU(cpu_name_); builder.setMAttrs(cpu_attrs_); builder.setErrorStr(&error_string_); @@ -464,6 +468,10 @@ void LlvmCodeGen::SetupJITListeners() { } LlvmCodeGen::~LlvmCodeGen() { + if (memory_manager_ != NULL) mem_tracker_->Release(memory_manager_->bytes_tracked()); + if (mem_tracker_->parent() != NULL) mem_tracker_->UnregisterFromParent(); + mem_tracker_.reset(); + // Execution engine executes callback on event listener, so tear down engine first. execution_engine_.reset(); symbol_emitter_.reset(); @@ -753,8 +761,14 @@ bool LlvmCodeGen::VerifyFunction(Function* fn) { return true; } -LlvmCodeGen::FnPrototype::FnPrototype(LlvmCodeGen* codegen, const string& name, - Type* ret_type) : codegen_(codegen), name_(name), ret_type_(ret_type) { +void LlvmCodeGen::SetNoInline(llvm::Function* function) const { + function->removeFnAttr(llvm::Attribute::AlwaysInline); + function->addFnAttr(llvm::Attribute::NoInline); +} + +LlvmCodeGen::FnPrototype::FnPrototype( + LlvmCodeGen* codegen, const string& name, Type* ret_type) + : codegen_(codegen), name_(name), ret_type_(ret_type) { DCHECK(!codegen_->is_compiled_) << "Not valid to add additional functions"; } @@ -927,10 +941,15 @@ Status LlvmCodeGen::FinalizeModule() { // Don't waste time optimizing module if there are no functions to JIT. This can happen // if the codegen object is created but no functions are successfully codegen'd. - if (fns_to_jit_compile_.empty()) return Status::OK(); + if (fns_to_jit_compile_.empty()) { + DestroyModule(); + return Status::OK(); + } RETURN_IF_ERROR(FinalizeLazyMaterialization()); - if (optimizations_enabled_ && !FLAGS_disable_optimization_passes) OptimizeModule(); + if (optimizations_enabled_ && !FLAGS_disable_optimization_passes) { + RETURN_IF_ERROR(OptimizeModule()); + } if (FLAGS_opt_module_dir.size() != 0) { string path = FLAGS_opt_module_dir + "/" + id_ + "_opt.ll"; @@ -949,17 +968,28 @@ Status LlvmCodeGen::FinalizeModule() { execution_engine_->finalizeObject(); } - // Get pointers to all codegen'd functions. + // Get pointers to all codegen'd functions for (int i = 0; i < fns_to_jit_compile_.size(); ++i) { Function* function = fns_to_jit_compile_[i].first; void* jitted_function = execution_engine_->getPointerToFunction(function); DCHECK(jitted_function != NULL) << "Failed to jit " << function->getName().data(); *fns_to_jit_compile_[i].second = jitted_function; } + + DestroyModule(); + + // Track the memory consumed by the compiled code. + int64_t bytes_allocated = memory_manager_->bytes_allocated(); + if (!mem_tracker_->TryConsume(bytes_allocated)) { + const string& msg = Substitute( + "Failed to allocate '$0' bytes for compiled code module", bytes_allocated); + return mem_tracker_->MemLimitExceeded(NULL, msg, bytes_allocated); + } + memory_manager_->set_bytes_tracked(bytes_allocated); return Status::OK(); } -void LlvmCodeGen::OptimizeModule() { +Status LlvmCodeGen::OptimizeModule() { SCOPED_TIMER(optimization_timer_); // This pass manager will construct optimizations passes that are "typical" for @@ -1003,6 +1033,14 @@ void LlvmCodeGen::OptimizeModule() { COUNTER_SET(num_functions_, counter.GetCount(InstructionCounter::TOTAL_FUNCTIONS)); COUNTER_SET(num_instructions_, counter.GetCount(InstructionCounter::TOTAL_INSTS)); + int64_t estimated_memory = ESTIMATED_OPTIMIZER_BYTES_PER_INST + * counter.GetCount(InstructionCounter::TOTAL_INSTS); + if (!mem_tracker_->TryConsume(estimated_memory)) { + const string& msg = Substitute( + "Codegen failed to reserve '$0' bytes for optimization", estimated_memory); + return mem_tracker_->MemLimitExceeded(NULL, msg, estimated_memory); + } + // Create and run function pass manager unique_ptr<legacy::FunctionPassManager> fn_pass_manager( new legacy::FunctionPassManager(module_)); @@ -1027,6 +1065,22 @@ void LlvmCodeGen::OptimizeModule() { VLOG(1) << counter.PrintCounters(); } } + + mem_tracker_->Release(estimated_memory); + return Status::OK(); +} + +void LlvmCodeGen::DestroyModule() { + // Clear all references to LLVM objects owned by the module. + loaded_functions_.clear(); + codegend_functions_.clear(); + registered_exprs_map_.clear(); + registered_exprs_.clear(); + llvm_intrinsics_.clear(); + hash_fns_.clear(); + fns_to_jit_compile_.clear(); + execution_engine_->removeModule(module_); + module_ = NULL; } void LlvmCodeGen::AddFunctionToJit(Function* fn, void** fn_ptr) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/codegen/llvm-codegen.h ---------------------------------------------------------------------- diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h index 21f9f71..8853d7e 100644 --- a/be/src/codegen/llvm-codegen.h +++ b/be/src/codegen/llvm-codegen.h @@ -72,6 +72,7 @@ namespace llvm { namespace impala { class CodegenSymbolEmitter; +class ImpalaMCJITMemoryManager; class SubExprElimination; class TupleDescriptor; @@ -124,6 +125,14 @@ class LlvmBuilder : public llvm::IRBuilder<> { /// instructions attached to the function object. Functions reachable by the function /// are also materialized recursively. // +/// Memory used for codegen is tracked via the MemTracker hierarchy. Codegen can use +/// significant memory for the IR module and for the optimization and compilation +/// algorithms. LLVM provides no way to directly track this transient memory - instead +/// the memory consumption is estimated based on the size of the IR module and released +/// once compilation finishes. Once compilation finishes, the size of the compiled +/// machine code is obtained from LLVM and and is tracked until the LlvmCodeGen object +/// is torn down and the compiled code is freed. +// class LlvmCodeGen { public: /// This function must be called once per process before any llvm API calls are @@ -138,13 +147,15 @@ class LlvmCodeGen { /// Creates a codegen instance for Impala initialized with the cross-compiled Impala IR. /// 'codegen' will contain the created object on success. + /// 'parent_mem_tracker' - if non-NULL, the CodeGen MemTracker is created under this. /// 'id' is used for outputting the IR module for debugging. - static Status CreateImpalaCodegen( - ObjectPool*, const std::string& id, boost::scoped_ptr<LlvmCodeGen>* codegen); + static Status CreateImpalaCodegen(ObjectPool*, MemTracker* parent_mem_tracker, + const std::string& id, boost::scoped_ptr<LlvmCodeGen>* codegen); /// Creates a LlvmCodeGen instance initialized with the module bitcode from 'file'. /// 'codegen' will contain the created object on success. - static Status CreateFromFile(ObjectPool*, const std::string& file, const std::string& id, + static Status CreateFromFile(ObjectPool*, MemTracker* parent_mem_tracker, + const std::string& file, const std::string& id, boost::scoped_ptr<LlvmCodeGen>* codegen); /// Removes all jit compiled dynamically linked functions from the process. @@ -202,8 +213,8 @@ class LlvmCodeGen { /// If 'print_ir' is true, the generated llvm::Function's IR will be printed when /// GetIR() is called. Avoid doing so for IR function prototypes generated for /// externally defined native function. - llvm::Function* GeneratePrototype(LlvmBuilder* builder = NULL, - llvm::Value** params = NULL, bool print_ir = true); + llvm::Function* GeneratePrototype( + LlvmBuilder* builder = NULL, llvm::Value** params = NULL, bool print_ir = true); private: friend class LlvmCodeGen; @@ -266,7 +277,9 @@ class LlvmCodeGen { /// Optimize and compile the module. This should be called after all functions to JIT /// have been added to the module via AddFunctionToJit(). If optimizations_enabled_ is - /// false, the module will not be optimized before compilation. + /// false, the module will not be optimized before compilation. After FinalizeModule() + /// is called, the LLVM module is destroyed and it is invalid to call any LlvmCodegen + /// functions. Status FinalizeModule(); /// Replaces all instructions in 'caller' that call 'target_name' with a call @@ -357,6 +370,10 @@ class LlvmCodeGen { llvm::Function* GetFnvHashFunction(int num_bytes = -1); llvm::Function* GetMurmurHashFunction(int num_bytes = -1); + /// Set the NoInline attribute on 'function' and remove the AlwaysInline attribute if + /// present. + void SetNoInline(llvm::Function* function) const; + /// Allocate stack storage for local variables. This is similar to traditional c, where /// all the variables must be declared at the top of the function. This helper can be /// called from anywhere and will add a stack allocation for 'var' at the beginning of @@ -454,6 +471,15 @@ class LlvmCodeGen { /// this LlvmCodeGen object. The module must be on the local filesystem. Status LinkModule(const std::string& file); + /// If there are more than this number of expr trees (or functions that evaluate + /// expressions), avoid inlining avoid inlining for the exprs exceeding this threshold. + static const int CODEGEN_INLINE_EXPRS_THRESHOLD = 100; + + /// If there are more than this number of expr trees (or functions that evaluate + /// expressions), avoid inlining the function that evaluates the expression batch + /// into the calling function. + static const int CODEGEN_INLINE_EXPR_BATCH_THRESHOLD = 25; + private: friend class ExprCodegenTest; friend class LlvmCodeGenTest; @@ -471,11 +497,12 @@ class LlvmCodeGen { /// Parses all the global variables in 'module' and adds any functions referenced by /// them to the set 'ref_fns' if they are not defined in the Impalad native code. /// These functions need to be materialized to avoid linking error. - static void ParseGVForFunctions(llvm::Module* module, - boost::unordered_set<string>* ref_fns); + static void ParseGVForFunctions( + llvm::Module* module, boost::unordered_set<string>* ref_fns); /// Top level codegen object. 'module_id' is used for debugging when outputting the IR. - LlvmCodeGen(ObjectPool* pool, const std::string& module_id); + LlvmCodeGen( + ObjectPool* pool, MemTracker* parent_mem_tracker, const std::string& module_id); /// Initializes the jitter and execution engine with the given module. Status Init(std::unique_ptr<llvm::Module> module); @@ -484,8 +511,8 @@ class LlvmCodeGen { /// 'codegen' will contain the created object on success. Note that the functions /// are not materialized. Getting a reference to the function via GetFunction() /// will materialize the function and its callees recursively. - static Status CreateFromMemory(ObjectPool* pool, const std::string& id, - boost::scoped_ptr<LlvmCodeGen>* codegen); + static Status CreateFromMemory(ObjectPool* pool, MemTracker* parent_mem_tracker, + const std::string& id, boost::scoped_ptr<LlvmCodeGen>* codegen); /// Loads an LLVM module from 'file' which is the local path to the LLVM bitcode file. /// The functions in the module are not materialized. Getting a reference to the @@ -524,7 +551,7 @@ class LlvmCodeGen { void ResetVerification() { is_corrupt_ = false; } /// Optimizes the module. This includes pruning the module of any unused functions. - void OptimizeModule(); + Status OptimizeModule(); /// Clears generated hash fns. This is only used for testing. void ClearHashFns(); @@ -570,6 +597,10 @@ class LlvmCodeGen { /// there is error in materializing the module. Status FinalizeLazyMaterialization(); + /// Destroy the IR module, freeing memory used by the IR. Any machine code that was + /// generated is retained by the execution engine. + void DestroyModule(); + /// Whether InitializeLlvm() has been called. static bool llvm_initialized_; @@ -589,6 +620,10 @@ class LlvmCodeGen { /// Codegen counters RuntimeProfile profile_; + /// MemTracker used for tracking memory consumed by codegen. Connected to a parent + /// MemTracker if one was provided during initialization. + boost::scoped_ptr<MemTracker> mem_tracker_; + /// Time spent reading the .ir file from the file system. RuntimeProfile::Counter* load_module_timer_; @@ -639,6 +674,9 @@ class LlvmCodeGen { /// Execution/Jitting engine. std::unique_ptr<llvm::ExecutionEngine> execution_engine_; + /// The memory manager used by 'execution_engine_'. Owned by 'execution_engine_'. + ImpalaMCJITMemoryManager* memory_manager_; + /// Functions parsed from pre-compiled module. Indexed by ImpalaIR::Function enum std::vector<llvm::Function*> loaded_functions_; @@ -686,8 +724,28 @@ class LlvmCodeGen { /// 'symbol_emitter_' are called by 'execution_engine_' when code is emitted or freed. /// The lifetime of the symbol emitter must be longer than 'execution_engine_'. boost::scoped_ptr<CodegenSymbolEmitter> symbol_emitter_; -}; + /// Very rough estimate of memory in bytes that the IR and the intermediate data + /// structures used by the optimizer may consume per LLVM IR instruction to be + /// optimized (after dead code is removed). The number is chosen to avoid pathological + /// behaviour at either extreme: failing queries unnecessarily because the memory + /// estimate is too high versus having large amounts of untracked memory because the + /// estimate is too low. + /// + /// This was chosen by looking at the behaviour of TPC-H queries. Using the heap growth + /// profile from gperftools reveal that LLVM allocated ~9mb of memory for fragments with + /// ~17k total instructions in TPC-H Q2. Inspection of other TPC-H queries revealed + /// that a typical fragment from a TPC-H query is < 5,000 instructions, which translates + /// to 2.5MB, which is almost always lower than the runtime memory requirement of the + /// fragment - so we are unlikely to fail queries unnecessarily. + /// + /// This assumes optimizer memory usage scales linearly with instruction count. This is + /// true only if the size of functions is bounded, because some optimization passes + /// (e.g. global value numbering) use time and memory that is super-linear in relation + /// to the # of instructions in a function. So codegen should avoid generating + /// arbitrarily large function. + static constexpr int64_t ESTIMATED_OPTIMIZER_BYTES_PER_INST = 512; +}; } #endif http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/codegen/mcjit-mem-mgr.h ---------------------------------------------------------------------- diff --git a/be/src/codegen/mcjit-mem-mgr.h b/be/src/codegen/mcjit-mem-mgr.h index 313d175..6982cef 100644 --- a/be/src/codegen/mcjit-mem-mgr.h +++ b/be/src/codegen/mcjit-mem-mgr.h @@ -25,19 +25,53 @@ extern void *__dso_handle __attribute__ ((__visibility__ ("hidden"))); namespace impala { -/// Custom memory manager to resolve references to __dso_handle in cross-compiled IR. +/// Custom memory manager. It is needed for a couple of purposes. +/// +/// We use it as a way to resolve references to __dso_handle in cross-compiled IR. /// This uses the same approach as the legacy llvm JIT to handle __dso_handle. MCJIT /// doesn't handle those for us: see LLVM issue 18062. /// TODO: get rid of this by purging the cross-compiled IR of references to __dso_handle, /// which come from global variables with destructors. +/// +/// We also use it to track how much memory is allocated for compiled code. class ImpalaMCJITMemoryManager : public llvm::SectionMemoryManager { public: + ImpalaMCJITMemoryManager() : bytes_allocated_(0), bytes_tracked_(0){}; + virtual uint64_t getSymbolAddress(const std::string& name) override { if (name == "__dso_handle") return reinterpret_cast<uint64_t>(&__dso_handle); return SectionMemoryManager::getSymbolAddress(name); } -}; + virtual uint8_t* allocateCodeSection(uintptr_t size, unsigned alignment, + unsigned section_id, llvm::StringRef section_name) override { + bytes_allocated_ += size; + return llvm::SectionMemoryManager::allocateCodeSection( + size, alignment, section_id, section_name); + } + + virtual uint8_t* allocateDataSection(uintptr_t size, unsigned alignment, + unsigned section_id, llvm::StringRef section_name, bool is_read_only) override { + bytes_allocated_ += size; + return llvm::SectionMemoryManager::allocateDataSection( + size, alignment, section_id, section_name, is_read_only); + } + + int64_t bytes_allocated() const { return bytes_allocated_; } + int64_t bytes_tracked() const { return bytes_tracked_; } + void set_bytes_tracked(int64_t bytes_tracked) { + DCHECK_LE(bytes_tracked, bytes_allocated_); + bytes_tracked_ = bytes_tracked; + } + + private: + /// Total bytes allocated for the compiled code. + int64_t bytes_allocated_; + + /// Total bytes already tracked by MemTrackers. <= 'bytes_allocated_'. + /// Needed to release the correct amount from the MemTracker when done. + int64_t bytes_tracked_; +}; } #endif http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/aggregation-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/aggregation-node.cc b/be/src/exec/aggregation-node.cc index f3b054e..822fc5d 100644 --- a/be/src/exec/aggregation-node.cc +++ b/be/src/exec/aggregation-node.cc @@ -164,6 +164,9 @@ Status AggregationNode::Prepare(RuntimeState* state) { void AggregationNode::Codegen(RuntimeState* state) { DCHECK(state->ShouldCodegen()); + ExecNode::Codegen(state); + if (IsNodeCodegenDisabled()) return; + bool codegen_enabled = false; LlvmCodeGen* codegen = state->codegen(); DCHECK(codegen != NULL); @@ -178,7 +181,6 @@ void AggregationNode::Codegen(RuntimeState* state) { } } runtime_profile()->AddCodegenMsg(codegen_enabled); - ExecNode::Codegen(state); } Status AggregationNode::Open(RuntimeState* state) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/exchange-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/exchange-node.cc b/be/src/exec/exchange-node.cc index 2b2b033..0bbd42b 100644 --- a/be/src/exec/exchange-node.cc +++ b/be/src/exec/exchange-node.cc @@ -92,11 +92,13 @@ Status ExchangeNode::Prepare(RuntimeState* state) { void ExchangeNode::Codegen(RuntimeState* state) { DCHECK(state->ShouldCodegen()); + ExecNode::Codegen(state); + if (IsNodeCodegenDisabled()) return; + if (is_merging_) { Status codegen_status = less_than_->Codegen(state); runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status); } - ExecNode::Codegen(state); } Status ExchangeNode::Open(RuntimeState* state) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/exec-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/exec-node.cc b/be/src/exec/exec-node.cc index 95306ec..518a0dd 100644 --- a/be/src/exec/exec-node.cc +++ b/be/src/exec/exec-node.cc @@ -129,6 +129,7 @@ ExecNode::ExecNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl rows_returned_counter_(NULL), rows_returned_rate_(NULL), containing_subplan_(NULL), + disable_codegen_(tnode.disable_codegen), is_closed_(false) { InitRuntimeProfile(PrintPlanNodeType(tnode.node_type)); } @@ -465,6 +466,10 @@ void ExecNode::AddCodegenDisabledMessage(RuntimeState* state) { } } +bool ExecNode::IsNodeCodegenDisabled() const { + return disable_codegen_; +} + // Codegen for EvalConjuncts. The generated signature is // For a node with two conjunct predicates // define i1 @EvalConjuncts(%"class.impala::ExprContext"** %ctxs, i32 %num_ctxs, @@ -507,6 +512,10 @@ Status ExecNode::CodegenEvalConjuncts(LlvmCodeGen* codegen, for (int i = 0; i < conjunct_ctxs.size(); ++i) { RETURN_IF_ERROR( conjunct_ctxs[i]->root()->GetCodegendComputeFn(codegen, &conjunct_fns[i])); + if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) { + // Avoid bloating EvalConjuncts by inlining everything into it. + codegen->SetNoInline(conjunct_fns[i]); + } } // Construct function signature to match @@ -566,12 +575,16 @@ Status ExecNode::CodegenEvalConjuncts(LlvmCodeGen* codegen, builder.CreateRet(codegen->true_value()); } + // Avoid inlining EvalConjuncts into caller if it is large. + if (conjunct_ctxs.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(*fn); + } + *fn = codegen->FinalizeFunction(*fn); if (*fn == NULL) { return Status("ExecNode::CodegenEvalConjuncts(): codegen'd EvalConjuncts() function " - "failed verification, see log"); + "failed verification, see log"); } return Status::OK(); } - } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/exec-node.h ---------------------------------------------------------------------- diff --git a/be/src/exec/exec-node.h b/be/src/exec/exec-node.h index 1dd2d32..6a7737b 100644 --- a/be/src/exec/exec-node.h +++ b/be/src/exec/exec-node.h @@ -184,6 +184,10 @@ class ExecNode { MemTracker* mem_tracker() { return mem_tracker_.get(); } MemTracker* expr_mem_tracker() { return expr_mem_tracker_.get(); } + /// Return true if codegen was disabled by the planner for this ExecNode. Does not + /// check to see if codegen was enabled for the enclosing fragment. + bool IsNodeCodegenDisabled() const; + /// Add codegen disabled message if codegen is disabled for this ExecNode. void AddCodegenDisabledMessage(RuntimeState* state); @@ -276,6 +280,9 @@ class ExecNode { /// Valid to call in or after Prepare(). bool IsInSubplan() const { return containing_subplan_ != NULL; } + /// If true, codegen should be disabled for this exec node. + const bool disable_codegen_; + /// Create a single exec node derived from thrift node; place exec node in 'pool'. static Status CreateNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs, ExecNode** node, RuntimeState* state); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/hash-join-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hash-join-node.cc b/be/src/exec/hash-join-node.cc index a18f356..77807f6 100644 --- a/be/src/exec/hash-join-node.cc +++ b/be/src/exec/hash-join-node.cc @@ -134,14 +134,13 @@ Status HashJoinNode::Prepare(RuntimeState* state) { AddExprCtxsToFree(other_join_conjunct_ctxs_); // TODO: default buckets - const bool stores_nulls = join_op_ == TJoinOp::RIGHT_OUTER_JOIN || - join_op_ == TJoinOp::FULL_OUTER_JOIN || - std::accumulate(is_not_distinct_from_.begin(), is_not_distinct_from_.end(), false, - std::logical_or<bool>()); + const bool stores_nulls = join_op_ == TJoinOp::RIGHT_OUTER_JOIN + || join_op_ == TJoinOp::FULL_OUTER_JOIN + || std::accumulate(is_not_distinct_from_.begin(), is_not_distinct_from_.end(), + false, std::logical_or<bool>()); hash_tbl_.reset(new OldHashTable(state, build_expr_ctxs_, probe_expr_ctxs_, - filter_expr_ctxs_, - child(1)->row_desc().tuple_descriptors().size(), stores_nulls, - is_not_distinct_from_, state->fragment_hash_seed(), mem_tracker(), filters_)); + filter_expr_ctxs_, child(1)->row_desc().tuple_descriptors().size(), stores_nulls, + is_not_distinct_from_, state->fragment_hash_seed(), mem_tracker(), filters_)); build_pool_.reset(new MemPool(mem_tracker())); AddCodegenDisabledMessage(state); return Status::OK(); @@ -149,6 +148,9 @@ Status HashJoinNode::Prepare(RuntimeState* state) { void HashJoinNode::Codegen(RuntimeState* state) { DCHECK(state->ShouldCodegen()); + ExecNode::Codegen(state); + if (IsNodeCodegenDisabled()) return; + LlvmCodeGen* codegen = state->codegen(); DCHECK(codegen != NULL); bool build_codegen_enabled = false; @@ -178,7 +180,6 @@ void HashJoinNode::Codegen(RuntimeState* state) { } runtime_profile()->AddCodegenMsg(build_codegen_enabled, "", "Build Side"); runtime_profile()->AddCodegenMsg(probe_codegen_enabled, "", "Probe Side"); - ExecNode::Codegen(state); } Status HashJoinNode::Reset(RuntimeState* state) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/hash-table.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc index 8d45ec7..f4ecb67 100644 --- a/be/src/exec/hash-table.cc +++ b/be/src/exec/hash-table.cc @@ -755,13 +755,18 @@ Status HashTableCtx::CodegenEvalRow(LlvmCodeGen* codegen, bool build, Function** if (!status.ok()) { (*fn)->eraseFromParent(); // deletes function *fn = NULL; - return Status(Substitute("Problem with HashTableCtx::CodegenEvalRow(): $0", - status.GetDetail())); + return Status(Substitute( + "Problem with HashTableCtx::CodegenEvalRow(): $0", status.GetDetail())); } - Value* get_expr_ctx_args[] = { this_ptr, codegen->GetIntConstant(TYPE_INT, i) }; + // Avoid bloating function by inlining too many exprs into it. + if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) { + codegen->SetNoInline(expr_fn); + } + + Value* get_expr_ctx_args[] = {this_ptr, codegen->GetIntConstant(TYPE_INT, i)}; Value* ctx_arg = builder.CreateCall(get_expr_ctx_fn, get_expr_ctx_args, "expr_ctx"); - Value* expr_fn_args[] = { ctx_arg, row }; + Value* expr_fn_args[] = {ctx_arg, row}; CodegenAnyVal result = CodegenAnyVal::CreateCallWrapped( codegen, &builder, ctxs[i]->root()->type(), expr_fn, expr_fn_args, "result"); Value* is_null = result.GetIsNull(); @@ -800,10 +805,15 @@ Status HashTableCtx::CodegenEvalRow(LlvmCodeGen* codegen, bool build, Function** } builder.CreateRet(has_null); + // Avoid inlining a large EvalRow() function into caller. + if (ctxs.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(*fn); + } + *fn = codegen->FinalizeFunction(*fn); if (*fn == NULL) { return Status("Codegen'd HashTableCtx::EvalRow() function failed verification, " - "see log"); + "see log"); } return Status::OK(); } @@ -972,6 +982,11 @@ Status HashTableCtx::CodegenHashRow(LlvmCodeGen* codegen, bool use_murmur, Funct } builder.CreateRet(hash_result); + + // Avoid inlining into caller if there are many exprs. + if (build_expr_ctxs_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(*fn); + } *fn = codegen->FinalizeFunction(*fn); if (*fn == NULL) { return Status( @@ -1095,12 +1110,16 @@ Status HashTableCtx::CodegenEquals(LlvmCodeGen* codegen, bool force_null_equalit if (!status.ok()) { (*fn)->eraseFromParent(); // deletes function *fn = NULL; - return Status(Substitute("Problem with HashTableCtx::CodegenEquals: $0", - status.GetDetail())); + return Status( + Substitute("Problem with HashTableCtx::CodegenEquals: $0", status.GetDetail())); + } + if (build_expr_ctxs_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) { + // Avoid bloating function by inlining too many exprs into it. + codegen->SetNoInline(expr_fn); } // Load ExprContext* from 'build_expr_ctxs_'. - Value* get_expr_ctx_args[] = { this_ptr, codegen->GetIntConstant(TYPE_INT, i) }; + Value* get_expr_ctx_args[] = {this_ptr, codegen->GetIntConstant(TYPE_INT, i)}; Value* ctx_arg = builder.CreateCall(get_expr_ctx_fn, get_expr_ctx_args, "expr_ctx"); // Evaluate the expression. @@ -1156,10 +1175,14 @@ Status HashTableCtx::CodegenEquals(LlvmCodeGen* codegen, bool force_null_equalit builder.SetInsertPoint(false_block); builder.CreateRet(codegen->false_value()); + // Avoid inlining into caller if it is large. + if (build_expr_ctxs_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(*fn); + } *fn = codegen->FinalizeFunction(*fn); if (*fn == NULL) { return Status("Codegen'd HashTableCtx::Equals() function failed verification, " - "see log"); + "see log"); } return Status::OK(); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/hdfs-scan-node-base.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scan-node-base.cc b/be/src/exec/hdfs-scan-node-base.cc index e616faf..5b97e76 100644 --- a/be/src/exec/hdfs-scan-node-base.cc +++ b/be/src/exec/hdfs-scan-node-base.cc @@ -297,6 +297,10 @@ Status HdfsScanNodeBase::Prepare(RuntimeState* state) { } void HdfsScanNodeBase::Codegen(RuntimeState* state) { + DCHECK(state->ShouldCodegen()); + ExecNode::Codegen(state); + if (IsNodeCodegenDisabled()) return; + // Create codegen'd functions for (int format = THdfsFileFormat::TEXT; format <= THdfsFileFormat::PARQUET; ++format) { vector<HdfsFileDesc*>& file_descs = @@ -343,7 +347,6 @@ void HdfsScanNodeBase::Codegen(RuntimeState* state) { } runtime_profile()->AddCodegenMsg(status.ok(), status, format_name); } - ExecNode::Codegen(state); } Status HdfsScanNodeBase::Open(RuntimeState* state) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/hdfs-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc index 390085f..c6a5f37 100644 --- a/be/src/exec/hdfs-scanner.cc +++ b/be/src/exec/hdfs-scanner.cc @@ -332,6 +332,7 @@ Status HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node, node->hdfs_table()->null_column_value().data(), node->hdfs_table()->null_column_value().size(), true, state->strict_mode()); if (fn == NULL) return Status("CodegenWriteSlot failed."); + if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) codegen->SetNoInline(fn); slot_fns.push_back(fn); } @@ -487,6 +488,10 @@ Status HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node, fn->eraseFromParent(); return status; } + if (node->materialized_slots().size() + conjunct_idx + >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) { + codegen->SetNoInline(conjunct_fn); + } Function* get_ctx_fn = codegen->GetFunction(IRFunction::HDFS_SCANNER_GET_CONJUNCT_CTX, false); @@ -505,6 +510,10 @@ Status HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node, builder.SetInsertPoint(eval_fail_block); builder.CreateRet(codegen->false_value()); + if (node->materialized_slots().size() + conjunct_ctxs.size() + > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(fn); + } *write_complete_tuple_fn = codegen->FinalizeFunction(fn); if (*write_complete_tuple_fn == NULL) { return Status("Failed to finalize write_complete_tuple_fn."); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/partitioned-aggregation-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc index 01622b9..6cc36be 100644 --- a/be/src/exec/partitioned-aggregation-node.cc +++ b/be/src/exec/partitioned-aggregation-node.cc @@ -281,14 +281,16 @@ Status PartitionedAggregationNode::Prepare(RuntimeState* state) { void PartitionedAggregationNode::Codegen(RuntimeState* state) { DCHECK(state->ShouldCodegen()); + ExecNode::Codegen(state); + if (IsNodeCodegenDisabled()) return; + LlvmCodeGen* codegen = state->codegen(); DCHECK(codegen != NULL); TPrefetchMode::type prefetch_mode = state_->query_options().prefetch_mode; - Status codegen_status = - is_streaming_preagg_ ? CodegenProcessBatchStreaming(codegen, prefetch_mode) : - CodegenProcessBatch(codegen, prefetch_mode); + Status codegen_status = is_streaming_preagg_ ? + CodegenProcessBatchStreaming(codegen, prefetch_mode) : + CodegenProcessBatch(codegen, prefetch_mode); runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status); - ExecNode::Codegen(state); } Status PartitionedAggregationNode::Open(RuntimeState* state) { @@ -1553,7 +1555,8 @@ Status PartitionedAggregationNode::QueryMaintenance(RuntimeState* state) { // } // Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen, - AggFnEvaluator* evaluator, SlotDescriptor* slot_desc, Function** fn) { + AggFnEvaluator* evaluator, int evaluator_idx, SlotDescriptor* slot_desc, + Function** fn) { PointerType* fn_ctx_type = codegen->GetPtrType(FunctionContextImpl::LLVM_FUNCTIONCONTEXT_NAME); PointerType* expr_ctxs_type = @@ -1697,10 +1700,18 @@ Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen, builder.SetInsertPoint(ret_block); builder.CreateRetVoid(); + // Avoid producing huge UpdateTuple() function after inlining - LLVM's optimiser + // memory/CPU usage scales super-linearly with function size. + // E.g. compute stats on all columns of a 1000-column table previously took 4 minutes to + // codegen because all the UpdateSlot() functions were inlined. + if (evaluator_idx >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) { + codegen->SetNoInline(*fn); + } + *fn = codegen->FinalizeFunction(*fn); if (*fn == NULL) { return Status("PartitionedAggregationNode::CodegenUpdateSlot(): codegen'd " - "UpdateSlot() function failed verification, see log"); + "UpdateSlot() function failed verification, see log"); } return Status::OK(); } @@ -1874,7 +1885,8 @@ Status PartitionedAggregationNode::CodegenUpdateTuple( builder.CreateStore(count_inc, slot_ptr); } else { Function* update_slot_fn; - RETURN_IF_ERROR(CodegenUpdateSlot(codegen, evaluator, slot_desc, &update_slot_fn)); + RETURN_IF_ERROR( + CodegenUpdateSlot(codegen, evaluator, i, slot_desc, &update_slot_fn)); Value* agg_fn_ctx_ptr = builder.CreateConstGEP1_32(agg_fn_ctxs_arg, i); Value* agg_fn_ctx = builder.CreateLoad(agg_fn_ctx_ptr, "agg_fn_ctx"); // Call GetExprCtx() to get the expression context. @@ -1887,6 +1899,12 @@ Status PartitionedAggregationNode::CodegenUpdateTuple( } builder.CreateRetVoid(); + // Avoid inlining big UpdateTuple function into outer loop - we're unlikely to get + // any benefit from it since the function call overhead will be amortized. + if (aggregate_evaluators_.size() > LlvmCodeGen::CODEGEN_INLINE_EXPR_BATCH_THRESHOLD) { + codegen->SetNoInline(*fn); + } + // CodegenProcessBatch() does the final optimizations. *fn = codegen->FinalizeFunction(*fn); if (*fn == NULL) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/partitioned-aggregation-node.h ---------------------------------------------------------------------- diff --git a/be/src/exec/partitioned-aggregation-node.h b/be/src/exec/partitioned-aggregation-node.h index 4287cea..54840c7 100644 --- a/be/src/exec/partitioned-aggregation-node.h +++ b/be/src/exec/partitioned-aggregation-node.h @@ -643,7 +643,7 @@ class PartitionedAggregationNode : public ExecNode { /// Codegen UpdateSlot(). Returns non-OK status if codegen is unsuccessful. /// Assumes is_merge = false; Status CodegenUpdateSlot(LlvmCodeGen* codegen, AggFnEvaluator* evaluator, - SlotDescriptor* slot_desc, llvm::Function** fn); + int evaluator_idx, SlotDescriptor* slot_desc, llvm::Function** fn); /// Codegen a call to a function implementing the UDA interface with input values /// from 'input_vals'. 'dst_val' should contain the previous value of the aggregate http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/partitioned-hash-join-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/partitioned-hash-join-node.cc b/be/src/exec/partitioned-hash-join-node.cc index 61b7eb7..6073486 100644 --- a/be/src/exec/partitioned-hash-join-node.cc +++ b/be/src/exec/partitioned-hash-join-node.cc @@ -141,6 +141,10 @@ Status PartitionedHashJoinNode::Prepare(RuntimeState* state) { void PartitionedHashJoinNode::Codegen(RuntimeState* state) { DCHECK(state->ShouldCodegen()); + // Codegen the children node; + ExecNode::Codegen(state); + if (IsNodeCodegenDisabled()) return; + LlvmCodeGen* codegen = state->codegen(); DCHECK(codegen != NULL); @@ -150,11 +154,8 @@ void PartitionedHashJoinNode::Codegen(RuntimeState* state) { // Codegen the probe side. TPrefetchMode::type prefetch_mode = state->query_options().prefetch_mode; Status probe_codegen_status = CodegenProcessProbeBatch(codegen, prefetch_mode); - runtime_profile()->AddCodegenMsg(probe_codegen_status.ok(), probe_codegen_status, - "Probe Side"); - - // Codegen the children node; - ExecNode::Codegen(state); + runtime_profile()->AddCodegenMsg( + probe_codegen_status.ok(), probe_codegen_status, "Probe Side"); } Status PartitionedHashJoinNode::Open(RuntimeState* state) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/sort-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/sort-node.cc b/be/src/exec/sort-node.cc index 7518223..1a8de83 100644 --- a/be/src/exec/sort-node.cc +++ b/be/src/exec/sort-node.cc @@ -51,8 +51,9 @@ Status SortNode::Prepare(RuntimeState* state) { state, child(0)->row_desc(), row_descriptor_, expr_mem_tracker())); AddExprCtxsToFree(sort_exec_exprs_); less_than_.reset(new TupleRowComparator(sort_exec_exprs_, is_asc_order_, nulls_first_)); - sorter_.reset(new Sorter(*less_than_.get(), sort_exec_exprs_.sort_tuple_slot_expr_ctxs(), - &row_descriptor_, mem_tracker(), runtime_profile(), state)); + sorter_.reset( + new Sorter(*less_than_.get(), sort_exec_exprs_.sort_tuple_slot_expr_ctxs(), + &row_descriptor_, mem_tracker(), runtime_profile(), state)); RETURN_IF_ERROR(sorter_->Init()); AddCodegenDisabledMessage(state); return Status::OK(); @@ -60,9 +61,11 @@ Status SortNode::Prepare(RuntimeState* state) { void SortNode::Codegen(RuntimeState* state) { DCHECK(state->ShouldCodegen()); + ExecNode::Codegen(state); + if (IsNodeCodegenDisabled()) return; + Status codegen_status = less_than_->Codegen(state); runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status); - ExecNode::Codegen(state); } Status SortNode::Open(RuntimeState* state) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exec/topn-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/topn-node.cc b/be/src/exec/topn-node.cc index bd4caec..a7f11bd 100644 --- a/be/src/exec/topn-node.cc +++ b/be/src/exec/topn-node.cc @@ -84,6 +84,9 @@ Status TopNNode::Prepare(RuntimeState* state) { void TopNNode::Codegen(RuntimeState* state) { DCHECK(state->ShouldCodegen()); + ExecNode::Codegen(state); + if (IsNodeCodegenDisabled()) return; + LlvmCodeGen* codegen = state->codegen(); DCHECK(codegen != NULL); @@ -126,7 +129,6 @@ void TopNNode::Codegen(RuntimeState* state) { } } runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status); - ExecNode::Codegen(state); } Status TopNNode::Open(RuntimeState* state) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exprs/aggregate-functions-ir.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc index 7ed6386..f6a91f4 100644 --- a/be/src/exprs/aggregate-functions-ir.cc +++ b/be/src/exprs/aggregate-functions-ir.cc @@ -1190,8 +1190,26 @@ void AggregateFunctions::HllUpdate(FunctionContext* ctx, const T& src, StringVal } } -void AggregateFunctions::HllMerge(FunctionContext* ctx, const StringVal& src, - StringVal* dst) { +// Specialize for DecimalVal to allow substituting decimal size. +template <> +void AggregateFunctions::HllUpdate( + FunctionContext* ctx, const DecimalVal& src, StringVal* dst) { + if (src.is_null) return; + DCHECK(!dst->is_null); + DCHECK_EQ(dst->len, HLL_LEN); + uint64_t hash_value = AnyValUtil::HashDecimal64( + src, Expr::GetConstantInt(*ctx, Expr::ARG_TYPE_SIZE, 0), HashUtil::FNV64_SEED); + if (hash_value != 0) { + // Use the lower bits to index into the number of streams and then + // find the first 1 bit after the index bits. + int idx = hash_value & (HLL_LEN - 1); + uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_PRECISION) + 1; + dst->ptr[idx] = ::max(dst->ptr[idx], first_one_bit); + } +} + +void AggregateFunctions::HllMerge( + FunctionContext* ctx, const StringVal& src, StringVal* dst) { DCHECK(!dst->is_null); DCHECK(!src.is_null); DCHECK_EQ(dst->len, HLL_LEN); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exprs/anyval-util.h ---------------------------------------------------------------------- diff --git a/be/src/exprs/anyval-util.h b/be/src/exprs/anyval-util.h index 8c75247..4703266 100644 --- a/be/src/exprs/anyval-util.h +++ b/be/src/exprs/anyval-util.h @@ -133,12 +133,19 @@ class AnyValUtil { return HashUtil::MurmurHash2_64(&tv, 12, seed); } - static uint64_t Hash64(const DecimalVal& v, const FunctionContext::TypeDesc& t, - int64_t seed) { - switch (ColumnType::GetDecimalByteSize(t.precision)) { - case 4: return HashUtil::MurmurHash2_64(&v.val4, 4, seed); - case 8: return HashUtil::MurmurHash2_64(&v.val8, 8, seed); - case 16: return HashUtil::MurmurHash2_64(&v.val16, 16, seed); + static uint64_t Hash64( + const DecimalVal& v, const FunctionContext::TypeDesc& t, int64_t seed) { + return HashDecimal64(v, ColumnType::GetDecimalByteSize(t.precision), seed); + } + + static uint64_t HashDecimal64(const DecimalVal& v, int byte_size, int64_t seed) { + switch (byte_size) { + case 4: + return HashUtil::MurmurHash2_64(&v.val4, 4, seed); + case 8: + return HashUtil::MurmurHash2_64(&v.val8, 8, seed); + case 16: + return HashUtil::MurmurHash2_64(&v.val16, 16, seed); default: DCHECK(false); return 0; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exprs/expr-codegen-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/expr-codegen-test.cc b/be/src/exprs/expr-codegen-test.cc index 6ccbbf4..cbb5617 100644 --- a/be/src/exprs/expr-codegen-test.cc +++ b/be/src/exprs/expr-codegen-test.cc @@ -251,7 +251,8 @@ TEST_F(ExprCodegenTest, TestInlineConstants) { stringstream test_udf_file; test_udf_file << getenv("IMPALA_HOME") << "/be/build/latest/exprs/expr-codegen-test.ll"; scoped_ptr<LlvmCodeGen> codegen; - ASSERT_OK(LlvmCodeGen::CreateFromFile(&pool, test_udf_file.str(), "test", &codegen)); + ASSERT_OK( + LlvmCodeGen::CreateFromFile(&pool, NULL, test_udf_file.str(), "test", &codegen)); Function* fn = codegen->GetFunction(TEST_GET_CONSTANT_SYMBOL, false); ASSERT_TRUE(fn != NULL); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/exprs/expr.h ---------------------------------------------------------------------- diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index 552fee9..942eec0 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -361,6 +361,7 @@ class Expr { /// recognize if this node is a slotref in order to speed up GetValue() const bool is_slotref_; + /// analysis is done, types are fixed at this point const ColumnType type_; std::vector<Expr*> children_; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/runtime/lib-cache.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/lib-cache.cc b/be/src/runtime/lib-cache.cc index fe8d180..00c1159 100644 --- a/be/src/runtime/lib-cache.cc +++ b/be/src/runtime/lib-cache.cc @@ -403,8 +403,8 @@ Status LibCache::GetCacheEntryInternal(const string& hdfs_lib_file, LibType type ObjectPool pool; scoped_ptr<LlvmCodeGen> codegen; string module_id = filesystem::path((*entry)->local_path).stem().string(); - RETURN_IF_ERROR( - LlvmCodeGen::CreateFromFile(&pool, (*entry)->local_path, module_id, &codegen)); + RETURN_IF_ERROR(LlvmCodeGen::CreateFromFile( + &pool, NULL, (*entry)->local_path, module_id, &codegen)); codegen->GetSymbols(&(*entry)->symbols); } else { DCHECK_EQ(type, TYPE_JAR); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/be/src/runtime/runtime-state.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/runtime-state.cc b/be/src/runtime/runtime-state.cc index 40a4946..37d5fb0 100644 --- a/be/src/runtime/runtime-state.cc +++ b/be/src/runtime/runtime-state.cc @@ -96,16 +96,16 @@ RuntimeState::RuntimeState(const TQueryCtx& query_ctx) RuntimeState::~RuntimeState() { block_mgr_.reset(); + // Release codegen memory before tearing down trackers. + codegen_.reset(); + // query_mem_tracker_ must be valid as long as instance_mem_tracker_ is so // delete instance_mem_tracker_ first. // LogUsage() walks the MemTracker tree top-down when the memory limit is exceeded. // Break the link between the instance_mem_tracker and its parent (query_mem_tracker_) // before the instance_mem_tracker_ and its children are destroyed. - if (instance_mem_tracker_.get() != NULL) { - // May be NULL if InitMemTrackers() is not called, for example from tests. - instance_mem_tracker_->UnregisterFromParent(); - } - + // May be NULL if InitMemTrackers() is not called, for example from tests. + if (instance_mem_tracker_ != NULL) instance_mem_tracker_->UnregisterFromParent(); instance_mem_tracker_.reset(); query_mem_tracker_.reset(); } @@ -184,8 +184,8 @@ Status RuntimeState::CreateBlockMgr() { Status RuntimeState::CreateCodegen() { if (codegen_.get() != NULL) return Status::OK(); // TODO: add the fragment ID to the codegen ID as well - RETURN_IF_ERROR(LlvmCodeGen::CreateImpalaCodegen( - obj_pool_.get(), PrintId(fragment_instance_id()), &codegen_)); + RETURN_IF_ERROR(LlvmCodeGen::CreateImpalaCodegen(obj_pool_.get(), + instance_mem_tracker_.get(), PrintId(fragment_instance_id()), &codegen_)); codegen_->EnableOptimizations(true); profile_.AddChild(codegen_->runtime_profile()); return Status::OK(); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/common/thrift/PlanNodes.thrift ---------------------------------------------------------------------- diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift index 6f863b2..233fbb2 100644 --- a/common/thrift/PlanNodes.thrift +++ b/common/thrift/PlanNodes.thrift @@ -463,32 +463,36 @@ struct TPlanNode { 6: required list<bool> nullable_tuples 7: optional list<Exprs.TExpr> conjuncts + // Set to true if codegen should be disabled for this plan node. Otherwise the plan + // node is codegen'd if the backend supports it. + 8: required bool disable_codegen + // one field per PlanNode subclass - 8: optional THdfsScanNode hdfs_scan_node - 9: optional THBaseScanNode hbase_scan_node - 23: optional TKuduScanNode kudu_scan_node - 10: optional TDataSourceScanNode data_source_node - 11: optional THashJoinNode hash_join_node - 12: optional TNestedLoopJoinNode nested_loop_join_node - 13: optional TAggregationNode agg_node - 14: optional TSortNode sort_node - 15: optional TUnionNode union_node - 16: optional TExchangeNode exchange_node - 17: optional TAnalyticNode analytic_node - 21: optional TUnnestNode unnest_node + 9: optional THdfsScanNode hdfs_scan_node + 10: optional THBaseScanNode hbase_scan_node + 11: optional TKuduScanNode kudu_scan_node + 12: optional TDataSourceScanNode data_source_node + 13: optional THashJoinNode hash_join_node + 14: optional TNestedLoopJoinNode nested_loop_join_node + 15: optional TAggregationNode agg_node + 16: optional TSortNode sort_node + 17: optional TUnionNode union_node + 18: optional TExchangeNode exchange_node + 19: optional TAnalyticNode analytic_node + 20: optional TUnnestNode unnest_node // Label that should be used to print this node to the user. - 18: optional string label + 21: optional string label // Additional details that should be printed to the user. This is node specific // e.g. table name, join strategy, etc. - 19: optional string label_detail + 22: optional string label_detail // Estimated execution stats generated by the planner. - 20: optional ExecStats.TExecStats estimated_stats + 23: optional ExecStats.TExecStats estimated_stats // Runtime filters assigned to this plan node - 22: optional list<TRuntimeFilterDesc> runtime_filters + 24: optional list<TRuntimeFilterDesc> runtime_filters } // A flattened representation of a tree of PlanNodes, obtained by depth-first http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java b/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java index 620ea56..24d7caa 100644 --- a/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java +++ b/fe/src/main/java/org/apache/impala/planner/DistributedPlanner.java @@ -818,6 +818,12 @@ public class DistributedPlanner { mergeFragment.getPlanRoot(), node.getAggInfo().getMergeAggInfo()); mergeAggNode.init(ctx_.getRootAnalyzer()); mergeAggNode.setLimit(limit); + // Merge of non-grouping agg only processes one tuple per Impala daemon - codegen + // will cost more than benefit. + if (!hasGrouping) { + mergeFragment.getPlanRoot().setDisableCodegen(true); + mergeAggNode.setDisableCodegen(true); + } // HAVING predicates can only be evaluated after the merge agg step node.transferConjuncts(mergeAggNode); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4db330e6/fe/src/main/java/org/apache/impala/planner/PlanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/PlanNode.java b/fe/src/main/java/org/apache/impala/planner/PlanNode.java index 141464e..312ea5b 100644 --- a/fe/src/main/java/org/apache/impala/planner/PlanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/PlanNode.java @@ -120,6 +120,9 @@ abstract public class PlanNode extends TreeNode<PlanNode> { // set in computeCosts(); invalid: -1 protected long perHostMemCost_ = -1; + // If true, disable codegen for this plan node. + protected boolean disableCodegen_; + // Runtime filters assigned to this node. protected List<RuntimeFilter> runtimeFilters_ = Lists.newArrayList(); @@ -144,6 +147,7 @@ abstract public class PlanNode extends TreeNode<PlanNode> { cardinality_ = -1; numNodes_ = -1; displayName_ = displayName; + disableCodegen_ = false; } /** @@ -159,6 +163,7 @@ abstract public class PlanNode extends TreeNode<PlanNode> { cardinality_ = -1; numNodes_ = -1; displayName_ = displayName; + disableCodegen_ = node.disableCodegen_; } /** @@ -393,9 +398,10 @@ abstract public class PlanNode extends TreeNode<PlanNode> { msg.addToConjuncts(e.treeToThrift()); } // Serialize any runtime filters - for (RuntimeFilter filter: runtimeFilters_) { + for (RuntimeFilter filter : runtimeFilters_) { msg.addToRuntime_filters(filter.toThrift()); } + msg.setDisable_codegen(disableCodegen_); toThrift(msg); container.addToNodes(msg); // For the purpose of the BE consider ExchangeNodes to have no children. @@ -718,4 +724,8 @@ abstract public class PlanNode extends TreeNode<PlanNode> { return sortedConjuncts; } + + public void setDisableCodegen(boolean disableCodegen) { + disableCodegen_ = disableCodegen; + } }
