Repository: incubator-impala Updated Branches: refs/heads/master c1d70f814 -> 2c9b4a9ba
IMPALA-3906: Materialize implicitly referenced IR functions With lazy materialization of IR functions in the LLVM module, there is an assumption that functions not referenced by IR functions used in the query don't have to be materialized and can have their linkage types changed to being externally defined. These unmaterialized functions may be referenced by global variables in the LLVM module and LLVM resolves these references to their definitions in the native Impalad binary. These global variables are mostly arrays containing references to other global constants or boost and Impala functions included in the cross-compiled code. When compiling Impalad with optimization, gcc may actually inline some of the functions which the global variables in the LLVM modules reference and LLVM may have linking error if these referenced IR functions are not materialized as it can no longer find their definitions in the Impalad binary. This patch fixes the problem by parsing all the global variables in the LLVM module during Impalad's initialization and recording all the referenced functions which aren't defined in the Impalad binary and make sure they are always materialized. A second problem which this patch fixes is that linking in external LLVM module (e.g. UDF IR created by a user) may implicitly materialize some functions in the external module. Normally, we would expect functions to be materialized through the GetFunction() interface and LinkModule() seems to be an exception. This patch updates LinkModule() to also parse the functions from the external module just like we do for unmaterialized functions in GetFunction(). Change-Id: I147eb332adc3f272b7a7a4fb4415842432c05f77 Reviewed-on: http://gerrit.cloudera.org:8080/3792 Reviewed-by: Michael Ho <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2c9b4a9b Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2c9b4a9b Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2c9b4a9b Branch: refs/heads/master Commit: 2c9b4a9ba96e0b9629403a4f45d6a3fd9a13f82d Parents: c1d70f8 Author: Michael Ho <[email protected]> Authored: Wed Jul 27 01:01:28 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Thu Jul 28 06:45:11 2016 +0000 ---------------------------------------------------------------------- be/src/codegen/llvm-codegen.cc | 133 +++++++++++++++++++++++++++++++--- be/src/codegen/llvm-codegen.h | 42 ++++++++--- be/src/util/symbols-util-test.cc | 4 - 3 files changed, 152 insertions(+), 27 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2c9b4a9b/be/src/codegen/llvm-codegen.cc ---------------------------------------------------------------------- diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc index c47038e..fb038cd 100644 --- a/be/src/codegen/llvm-codegen.cc +++ b/be/src/codegen/llvm-codegen.cc @@ -57,6 +57,7 @@ #include "codegen/mcjit-mem-mgr.h" #include "impala-ir/impala-ir-names.h" #include "runtime/hdfs-fs-cache.h" +#include "runtime/lib-cache.h" #include "runtime/mem-pool.h" #include "runtime/string-value.h" #include "runtime/timestamp-value.h" @@ -95,12 +96,75 @@ bool LlvmCodeGen::llvm_initialized_ = false; string LlvmCodeGen::cpu_name_; vector<string> LlvmCodeGen::cpu_attrs_; +unordered_set<string> LlvmCodeGen::gv_ref_ir_fns_; static void LlvmCodegenHandleError(void* user_data, const std::string& reason, bool gen_crash_diag) { LOG(FATAL) << "LLVM hit fatal error: " << reason.c_str(); } +bool LlvmCodeGen::IsDefinedInImpalad(const string& fn_name) { + void* fn_ptr = NULL; + Status status = + LibCache::instance()->GetSoFunctionPtr("", fn_name, &fn_ptr, NULL, true); + return status.ok(); +} + +void LlvmCodeGen::ParseGlobalConstant(Value* val, unordered_set<string>* ref_fns) { + // Parse constants to find any referenced functions. + vector<string> fn_names; + if (isa<Function>(val)) { + fn_names.push_back(cast<Function>(val)->getName().str()); + } else if (isa<BlockAddress>(val)) { + const BlockAddress *ba = cast<BlockAddress>(val); + fn_names.push_back(ba->getFunction()->getName().str()); + } else if (isa<GlobalAlias>(val)) { + GlobalAlias* alias = cast<GlobalAlias>(val); + ParseGlobalConstant(alias->getAliasee(), ref_fns); + } else if (isa<ConstantExpr>(val)) { + const ConstantExpr* ce = cast<ConstantExpr>(val); + if (ce->isCast()) { + for (User::const_op_iterator oi=ce->op_begin(); oi != ce->op_end(); ++oi) { + Function* fn = dyn_cast<Function>(*oi); + if (fn != NULL) fn_names.push_back(fn->getName().str()); + } + } + } else if (isa<ConstantStruct>(val) || isa<ConstantArray>(val) || + isa<ConstantDataArray>(val)) { + const Constant* val_constant = cast<Constant>(val); + for (int i = 0; i < val_constant->getNumOperands(); ++i) { + ParseGlobalConstant(val_constant->getOperand(i), ref_fns); + } + } else if (isa<ConstantVector>(val) || isa<ConstantDataVector>(val)) { + const Constant* val_const = cast<Constant>(val); + for (int i = 0; i < val->getType()->getVectorNumElements(); ++i) { + ParseGlobalConstant(val_const->getAggregateElement(i), ref_fns); + } + } else { + // Ignore constants which cannot contain function pointers. Ignore other global + // variables referenced by this global variable as InitializeLlvm() will parse + // all global variables. + DCHECK(isa<UndefValue>(val) || isa<ConstantFP>(val) || isa<ConstantInt>(val) || + isa<GlobalVariable>(val) || isa<ConstantTokenNone>(val) || + isa<ConstantPointerNull>(val) || isa<ConstantAggregateZero>(val) || + isa<ConstantDataSequential>(val)); + } + + // Adds all functions not defined in Impalad native binary. + for (const string& fn_name: fn_names) { + if (!IsDefinedInImpalad(fn_name)) ref_fns->insert(fn_name); + } +} + +void LlvmCodeGen::ParseGVForFunctions(Module* module, unordered_set<string>* ref_fns) { + for (GlobalVariable& gv: module->globals()) { + if (gv.hasInitializer() && gv.isConstant()) { + Constant* val = gv.getInitializer(); + if (val->getNumOperands() > 0) ParseGlobalConstant(val, ref_fns); + } + } +} + void LlvmCodeGen::InitializeLlvm(bool load_backend) { DCHECK(!llvm_initialized_); llvm::remove_fatal_error_handler(); @@ -129,6 +193,11 @@ void LlvmCodeGen::InitializeLlvm(bool load_backend) { // Write an empty map file for perf to find. if (FLAGS_perf_map) CodegenSymbolEmitter::WritePerfMap(); + + ObjectPool init_pool; + scoped_ptr<LlvmCodeGen> init_codegen; + Status status = LlvmCodeGen::CreateFromMemory(&init_pool, "init", &init_codegen); + ParseGVForFunctions(init_codegen->module_, &gv_ref_ir_fns_); } LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& id) : @@ -245,6 +314,21 @@ Status LlvmCodeGen::LinkModule(const string& file) { // The module data layout must match the one selected by the execution engine. new_module->setDataLayout(execution_engine_->getDataLayout()); + // Record all IR functions in 'new_module' referenced by the module's global variables + // if they are not defined in the Impalad native code. They must be materialized to + // avoid linking error. + unordered_set<string> ref_fns; + ParseGVForFunctions(new_module.get(), &ref_fns); + + // Record all the materializable functions in the new module before linking. + // Linking the new module to the main module (i.e. 'module_') may materialize + // functions in the new module. These materialized functions need to be parsed + // to materialize any functions they call in 'module_'. + unordered_set<string> materializable_fns; + for (Function& fn: new_module->functions()) { + if (fn.isMaterializable()) materializable_fns.insert(fn.getName().str()); + } + bool error = Linker::linkModules(*module_, std::move(new_module)); if (error) { stringstream ss; @@ -252,6 +336,23 @@ Status LlvmCodeGen::LinkModule(const string& file) { return Status(ss.str()); } linked_modules_.insert(file); + + for (const string& fn_name: ref_fns) { + Function* fn = module_->getFunction(fn_name); + DCHECK(fn != NULL); + if (fn->isMaterializable()) { + MaterializeFunction(fn); + materializable_fns.erase(fn->getName().str()); + } + } + // Parse materialized functions in the source module and materialize functions it + // references. Do it after linking so LLVM has "merged" functions defined in both + // modules. + for (const string& fn_name: materializable_fns) { + Function* fn = module_->getFunction(fn_name); + DCHECK(fn != NULL); + if (!fn->isMaterializable()) MaterializeCallees(fn); + } return Status::OK(); } @@ -291,6 +392,9 @@ Status LlvmCodeGen::CreateImpalaCodegen( vector<Function*> functions; for (Function& fn: codegen->module_->functions()) { if (fn.isMaterializable()) functions.push_back(&fn); + if (gv_ref_ir_fns_.find(fn.getName()) != gv_ref_ir_fns_.end()) { + codegen->MaterializeFunction(&fn); + } } int parsed_functions = 0; for (int i = 0; i < functions.size(); ++i) { @@ -526,6 +630,22 @@ void LlvmCodeGen::CreateIfElseBlocks(Function* fn, const string& if_name, *else_block = BasicBlock::Create(context(), else_name, fn, insert_before); } +Status LlvmCodeGen::MaterializeCallees(Function* fn) { + for (inst_iterator iter = inst_begin(fn); iter != inst_end(fn); ++iter) { + Instruction* instr = &*iter; + Function* called_fn = NULL; + if (isa<CallInst>(instr)) { + CallInst* call_instr = reinterpret_cast<CallInst*>(instr); + called_fn = call_instr->getCalledFunction(); + } else if (isa<InvokeInst>(instr)) { + InvokeInst* invoke_instr = reinterpret_cast<InvokeInst*>(instr); + called_fn = invoke_instr->getCalledFunction(); + } + if (called_fn != NULL) RETURN_IF_ERROR(MaterializeFunctionHelper(called_fn)); + } + return Status::OK(); +} + Status LlvmCodeGen::MaterializeFunctionHelper(Function *fn) { DCHECK(!is_compiled_); if (fn->isIntrinsic() || !fn->isMaterializable()) return Status::OK(); @@ -538,18 +658,7 @@ Status LlvmCodeGen::MaterializeFunctionHelper(Function *fn) { // Materialized functions are marked as not materializable by LLVM. DCHECK(!fn->isMaterializable()); - for (inst_iterator iter = inst_begin(fn); iter != inst_end(fn); ++iter) { - Instruction* instr = &*iter; - Function* called_fn = NULL; - if (isa<CallInst>(instr)) { - CallInst* call_instr = reinterpret_cast<CallInst*>(instr); - called_fn = call_instr->getCalledFunction(); - } else if (isa<InvokeInst>(instr)) { - InvokeInst* invoke_instr = reinterpret_cast<InvokeInst*>(instr); - called_fn = invoke_instr->getCalledFunction(); - } - if (called_fn != NULL) MaterializeFunctionHelper(called_fn); - } + RETURN_IF_ERROR(MaterializeCallees(fn)); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2c9b4a9b/be/src/codegen/llvm-codegen.h ---------------------------------------------------------------------- diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h index adfcc90..fa465fe 100644 --- a/be/src/codegen/llvm-codegen.h +++ b/be/src/codegen/llvm-codegen.h @@ -432,6 +432,21 @@ class LlvmCodeGen { friend class LlvmCodeGenTest; friend class SubExprElimination; + /// Returns true if the function 'fn' is defined in the Impalad native code. + static bool IsDefinedInImpalad(const std::string& fn); + + /// Parses the given global constant recursively and adds functions referenced in it + /// to the set 'ref_fns' if they are not defined in the Impalad native code. These + /// functions need to be materialized to avoid linking error. + static void ParseGlobalConstant(llvm::Value* global_const, + boost::unordered_set<string>* ref_fns); + + /// Parses all the global variables in 'module' and adds any functions referenced by + /// them to the set 'ref_fns' if they are not defined in the Impalad native code. + /// These functions need to be materialized to avoid linking error. + static void ParseGVForFunctions(llvm::Module* module, + boost::unordered_set<string>* ref_fns); + /// Top level codegen object. 'module_id' is used for debugging when outputting the IR. LlvmCodeGen(ObjectPool* pool, const std::string& module_id); @@ -498,12 +513,9 @@ class LlvmCodeGen { static void FindCallSites(llvm::Function* caller, const std::string& target_name, std::vector<llvm::CallInst*>* results); - /// Whether InitializeLlvm() has been called. - static bool llvm_initialized_; - - /// Host CPU name and attributes, filled in by InitializeLlvm(). - static std::string cpu_name_; - static std::vector<std::string> cpu_attrs_; + /// This function parses the function body of the given function 'fn' and materializes + /// any functions called by it. + Status MaterializeCallees(llvm::Function* fn); /// This is the workhorse for materializing function 'fn'. It's invoked by /// MaterializeFunction(). Calls LLVM to materialize 'fn' if it's materializable @@ -531,6 +543,19 @@ class LlvmCodeGen { /// there is error in materializing the module. Status FinalizeLazyMaterialization(); + /// Whether InitializeLlvm() has been called. + static bool llvm_initialized_; + + /// Host CPU name and attributes, filled in by InitializeLlvm(). + static std::string cpu_name_; + static std::vector<std::string> cpu_attrs_; + + /// This set contains names of functions referenced by global variables which aren't + /// defined in the Impalad native code (they may have been inlined by gcc). These + /// functions are always materialized each time the module is loaded to ensure that + /// LLVM can resolve references to them. + static boost::unordered_set<std::string> gv_ref_ir_fns_; + /// ID used for debugging (can be e.g. the fragment instance ID) std::string id_; @@ -587,11 +612,6 @@ class LlvmCodeGen { /// Execution/Jitting engine. std::unique_ptr<llvm::ExecutionEngine> execution_engine_; - /// Keeps track of the external functions that have been included in this module - /// e.g libc functions or non-jitted impala functions. - /// TODO: this should probably be FnPrototype->Functions mapping - std::map<std::string, llvm::Function*> external_functions_; - /// Functions parsed from pre-compiled module. Indexed by ImpalaIR::Function enum std::vector<llvm::Function*> loaded_functions_; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2c9b4a9b/be/src/util/symbols-util-test.cc ---------------------------------------------------------------------- diff --git a/be/src/util/symbols-util-test.cc b/be/src/util/symbols-util-test.cc index ffe6bf0..1a0a129 100644 --- a/be/src/util/symbols-util-test.cc +++ b/be/src/util/symbols-util-test.cc @@ -17,8 +17,6 @@ #include <iostream> #include <gtest/gtest.h> -#include "codegen/llvm-codegen.h" -#include "util/cpu-info.h" #include "util/symbols-util.h" #include "common/names.h" @@ -325,7 +323,5 @@ TEST(SymbolsUtil, ManglingPrepareOrClose) { int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - impala::CpuInfo::Init(); - impala::LlvmCodeGen::InitializeLlvm(); return RUN_ALL_TESTS(); }
