Repository: incubator-impala
Updated Branches:
  refs/heads/master c1d70f814 -> 2c9b4a9ba


IMPALA-3906: Materialize implicitly referenced IR functions

With lazy materialization of IR functions in the LLVM module,
there is an assumption that functions not referenced by IR
functions used in the query don't have to be materialized
and can have their linkage types changed to being externally
defined. These unmaterialized functions may be referenced by
global variables in the LLVM module and LLVM resolves these
references to their definitions in the native Impalad binary.
These global variables are mostly arrays containing references
to other global constants or boost and Impala functions included
in the cross-compiled code.

When compiling Impalad with optimization, gcc may actually
inline some of the functions which the global variables in
the LLVM modules reference and LLVM may have linking error
if these referenced IR functions are not materialized as it
can no longer find their definitions in the Impalad binary.
This patch fixes the problem by parsing all the global variables
in the LLVM module during Impalad's initialization and recording
all the referenced functions which aren't defined in the Impalad
binary and make sure they are always materialized.

A second problem which this patch fixes is that linking in
external LLVM module (e.g. UDF IR created by a user) may
implicitly materialize some functions in the external module.
Normally, we would expect functions to be materialized
through the GetFunction() interface and LinkModule() seems
to be an exception. This patch updates LinkModule() to also
parse the functions from the external module just like we do
for unmaterialized functions in GetFunction().

Change-Id: I147eb332adc3f272b7a7a4fb4415842432c05f77
Reviewed-on: http://gerrit.cloudera.org:8080/3792
Reviewed-by: Michael Ho <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2c9b4a9b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2c9b4a9b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2c9b4a9b

Branch: refs/heads/master
Commit: 2c9b4a9ba96e0b9629403a4f45d6a3fd9a13f82d
Parents: c1d70f8
Author: Michael Ho <[email protected]>
Authored: Wed Jul 27 01:01:28 2016 -0700
Committer: Internal Jenkins <[email protected]>
Committed: Thu Jul 28 06:45:11 2016 +0000

----------------------------------------------------------------------
 be/src/codegen/llvm-codegen.cc   | 133 +++++++++++++++++++++++++++++++---
 be/src/codegen/llvm-codegen.h    |  42 ++++++++---
 be/src/util/symbols-util-test.cc |   4 -
 3 files changed, 152 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2c9b4a9b/be/src/codegen/llvm-codegen.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index c47038e..fb038cd 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -57,6 +57,7 @@
 #include "codegen/mcjit-mem-mgr.h"
 #include "impala-ir/impala-ir-names.h"
 #include "runtime/hdfs-fs-cache.h"
+#include "runtime/lib-cache.h"
 #include "runtime/mem-pool.h"
 #include "runtime/string-value.h"
 #include "runtime/timestamp-value.h"
@@ -95,12 +96,75 @@ bool LlvmCodeGen::llvm_initialized_ = false;
 
 string LlvmCodeGen::cpu_name_;
 vector<string> LlvmCodeGen::cpu_attrs_;
+unordered_set<string> LlvmCodeGen::gv_ref_ir_fns_;
 
 static void LlvmCodegenHandleError(void* user_data, const std::string& reason,
     bool gen_crash_diag) {
   LOG(FATAL) << "LLVM hit fatal error: " << reason.c_str();
 }
 
+bool LlvmCodeGen::IsDefinedInImpalad(const string& fn_name) {
+  void* fn_ptr = NULL;
+  Status status =
+      LibCache::instance()->GetSoFunctionPtr("", fn_name, &fn_ptr, NULL, true);
+  return status.ok();
+}
+
+void LlvmCodeGen::ParseGlobalConstant(Value* val, unordered_set<string>* 
ref_fns) {
+  // Parse constants to find any referenced functions.
+  vector<string> fn_names;
+  if (isa<Function>(val)) {
+    fn_names.push_back(cast<Function>(val)->getName().str());
+  } else if (isa<BlockAddress>(val)) {
+    const BlockAddress *ba = cast<BlockAddress>(val);
+    fn_names.push_back(ba->getFunction()->getName().str());
+  } else if (isa<GlobalAlias>(val)) {
+    GlobalAlias* alias = cast<GlobalAlias>(val);
+    ParseGlobalConstant(alias->getAliasee(), ref_fns);
+  } else if (isa<ConstantExpr>(val)) {
+    const ConstantExpr* ce = cast<ConstantExpr>(val);
+    if (ce->isCast()) {
+      for (User::const_op_iterator oi=ce->op_begin(); oi != ce->op_end(); 
++oi) {
+        Function* fn = dyn_cast<Function>(*oi);
+        if (fn != NULL) fn_names.push_back(fn->getName().str());
+      }
+    }
+  } else if (isa<ConstantStruct>(val) || isa<ConstantArray>(val) ||
+      isa<ConstantDataArray>(val)) {
+    const Constant* val_constant = cast<Constant>(val);
+    for (int i = 0; i < val_constant->getNumOperands(); ++i) {
+      ParseGlobalConstant(val_constant->getOperand(i), ref_fns);
+    }
+  } else if (isa<ConstantVector>(val) || isa<ConstantDataVector>(val)) {
+    const Constant* val_const = cast<Constant>(val);
+    for (int i = 0; i < val->getType()->getVectorNumElements(); ++i) {
+      ParseGlobalConstant(val_const->getAggregateElement(i), ref_fns);
+    }
+  } else {
+    // Ignore constants which cannot contain function pointers. Ignore other 
global
+    // variables referenced by this global variable as InitializeLlvm() will 
parse
+    // all global variables.
+    DCHECK(isa<UndefValue>(val) || isa<ConstantFP>(val) || 
isa<ConstantInt>(val) ||
+        isa<GlobalVariable>(val) || isa<ConstantTokenNone>(val) ||
+        isa<ConstantPointerNull>(val) || isa<ConstantAggregateZero>(val) ||
+        isa<ConstantDataSequential>(val));
+  }
+
+  // Adds all functions not defined in Impalad native binary.
+  for (const string& fn_name: fn_names) {
+    if (!IsDefinedInImpalad(fn_name)) ref_fns->insert(fn_name);
+  }
+}
+
+void LlvmCodeGen::ParseGVForFunctions(Module* module, unordered_set<string>* 
ref_fns) {
+  for (GlobalVariable& gv: module->globals()) {
+    if (gv.hasInitializer() && gv.isConstant()) {
+      Constant* val = gv.getInitializer();
+      if (val->getNumOperands() > 0) ParseGlobalConstant(val, ref_fns);
+    }
+  }
+}
+
 void LlvmCodeGen::InitializeLlvm(bool load_backend) {
   DCHECK(!llvm_initialized_);
   llvm::remove_fatal_error_handler();
@@ -129,6 +193,11 @@ void LlvmCodeGen::InitializeLlvm(bool load_backend) {
 
   // Write an empty map file for perf to find.
   if (FLAGS_perf_map) CodegenSymbolEmitter::WritePerfMap();
+
+  ObjectPool init_pool;
+  scoped_ptr<LlvmCodeGen> init_codegen;
+  Status status = LlvmCodeGen::CreateFromMemory(&init_pool, "init", 
&init_codegen);
+  ParseGVForFunctions(init_codegen->module_, &gv_ref_ir_fns_);
 }
 
 LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& id) :
@@ -245,6 +314,21 @@ Status LlvmCodeGen::LinkModule(const string& file) {
   // The module data layout must match the one selected by the execution 
engine.
   new_module->setDataLayout(execution_engine_->getDataLayout());
 
+  // Record all IR functions in 'new_module' referenced by the module's global 
variables
+  // if they are not defined in the Impalad native code. They must be 
materialized to
+  // avoid linking error.
+  unordered_set<string> ref_fns;
+  ParseGVForFunctions(new_module.get(), &ref_fns);
+
+  // Record all the materializable functions in the new module before linking.
+  // Linking the new module to the main module (i.e. 'module_') may materialize
+  // functions in the new module. These materialized functions need to be 
parsed
+  // to materialize any functions they call in 'module_'.
+  unordered_set<string> materializable_fns;
+  for (Function& fn: new_module->functions()) {
+    if (fn.isMaterializable()) materializable_fns.insert(fn.getName().str());
+  }
+
   bool error = Linker::linkModules(*module_, std::move(new_module));
   if (error) {
     stringstream ss;
@@ -252,6 +336,23 @@ Status LlvmCodeGen::LinkModule(const string& file) {
     return Status(ss.str());
   }
   linked_modules_.insert(file);
+
+  for (const string& fn_name: ref_fns) {
+    Function* fn = module_->getFunction(fn_name);
+    DCHECK(fn != NULL);
+    if (fn->isMaterializable()) {
+      MaterializeFunction(fn);
+      materializable_fns.erase(fn->getName().str());
+    }
+  }
+  // Parse materialized functions in the source module and materialize 
functions it
+  // references. Do it after linking so LLVM has "merged" functions defined in 
both
+  // modules.
+  for (const string& fn_name: materializable_fns) {
+    Function* fn = module_->getFunction(fn_name);
+    DCHECK(fn != NULL);
+    if (!fn->isMaterializable()) MaterializeCallees(fn);
+  }
   return Status::OK();
 }
 
@@ -291,6 +392,9 @@ Status LlvmCodeGen::CreateImpalaCodegen(
   vector<Function*> functions;
   for (Function& fn: codegen->module_->functions()) {
     if (fn.isMaterializable()) functions.push_back(&fn);
+    if (gv_ref_ir_fns_.find(fn.getName()) != gv_ref_ir_fns_.end()) {
+      codegen->MaterializeFunction(&fn);
+    }
   }
   int parsed_functions = 0;
   for (int i = 0; i < functions.size(); ++i) {
@@ -526,6 +630,22 @@ void LlvmCodeGen::CreateIfElseBlocks(Function* fn, const 
string& if_name,
   *else_block = BasicBlock::Create(context(), else_name, fn, insert_before);
 }
 
+Status LlvmCodeGen::MaterializeCallees(Function* fn) {
+  for (inst_iterator iter = inst_begin(fn); iter != inst_end(fn); ++iter) {
+    Instruction* instr = &*iter;
+    Function* called_fn = NULL;
+    if (isa<CallInst>(instr)) {
+      CallInst* call_instr = reinterpret_cast<CallInst*>(instr);
+      called_fn = call_instr->getCalledFunction();
+    } else if (isa<InvokeInst>(instr)) {
+      InvokeInst* invoke_instr = reinterpret_cast<InvokeInst*>(instr);
+      called_fn = invoke_instr->getCalledFunction();
+    }
+    if (called_fn != NULL) 
RETURN_IF_ERROR(MaterializeFunctionHelper(called_fn));
+  }
+  return Status::OK();
+}
+
 Status LlvmCodeGen::MaterializeFunctionHelper(Function *fn) {
   DCHECK(!is_compiled_);
   if (fn->isIntrinsic() || !fn->isMaterializable()) return Status::OK();
@@ -538,18 +658,7 @@ Status LlvmCodeGen::MaterializeFunctionHelper(Function 
*fn) {
 
   // Materialized functions are marked as not materializable by LLVM.
   DCHECK(!fn->isMaterializable());
-  for (inst_iterator iter = inst_begin(fn); iter != inst_end(fn); ++iter) {
-    Instruction* instr = &*iter;
-    Function* called_fn = NULL;
-    if (isa<CallInst>(instr)) {
-      CallInst* call_instr = reinterpret_cast<CallInst*>(instr);
-      called_fn = call_instr->getCalledFunction();
-    } else if (isa<InvokeInst>(instr)) {
-      InvokeInst* invoke_instr = reinterpret_cast<InvokeInst*>(instr);
-      called_fn = invoke_instr->getCalledFunction();
-    }
-    if (called_fn != NULL) MaterializeFunctionHelper(called_fn);
-  }
+  RETURN_IF_ERROR(MaterializeCallees(fn));
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2c9b4a9b/be/src/codegen/llvm-codegen.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h
index adfcc90..fa465fe 100644
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -432,6 +432,21 @@ class LlvmCodeGen {
   friend class LlvmCodeGenTest;
   friend class SubExprElimination;
 
+  /// Returns true if the function 'fn' is defined in the Impalad native code.
+  static bool IsDefinedInImpalad(const std::string& fn);
+
+  /// Parses the given global constant recursively and adds functions 
referenced in it
+  /// to the set 'ref_fns' if they are not defined in the Impalad native code. 
These
+  /// functions need to be materialized to avoid linking error.
+  static void ParseGlobalConstant(llvm::Value* global_const,
+      boost::unordered_set<string>* ref_fns);
+
+  /// Parses all the global variables in 'module' and adds any functions 
referenced by
+  /// them to the set 'ref_fns' if they are not defined in the Impalad native 
code.
+  /// These functions need to be materialized to avoid linking error.
+  static void ParseGVForFunctions(llvm::Module* module,
+      boost::unordered_set<string>* ref_fns);
+
   /// Top level codegen object.  'module_id' is used for debugging when 
outputting the IR.
   LlvmCodeGen(ObjectPool* pool, const std::string& module_id);
 
@@ -498,12 +513,9 @@ class LlvmCodeGen {
   static void FindCallSites(llvm::Function* caller, const std::string& 
target_name,
       std::vector<llvm::CallInst*>* results);
 
-  /// Whether InitializeLlvm() has been called.
-  static bool llvm_initialized_;
-
-  /// Host CPU name and attributes, filled in by InitializeLlvm().
-  static std::string cpu_name_;
-  static std::vector<std::string> cpu_attrs_;
+  /// This function parses the function body of the given function 'fn' and 
materializes
+  /// any functions called by it.
+  Status MaterializeCallees(llvm::Function* fn);
 
   /// This is the workhorse for materializing function 'fn'. It's invoked by
   /// MaterializeFunction(). Calls LLVM to materialize 'fn' if it's 
materializable
@@ -531,6 +543,19 @@ class LlvmCodeGen {
   /// there is error in materializing the module.
   Status FinalizeLazyMaterialization();
 
+  /// Whether InitializeLlvm() has been called.
+  static bool llvm_initialized_;
+
+  /// Host CPU name and attributes, filled in by InitializeLlvm().
+  static std::string cpu_name_;
+  static std::vector<std::string> cpu_attrs_;
+
+  /// This set contains names of functions referenced by global variables 
which aren't
+  /// defined in the Impalad native code (they may have been inlined by gcc). 
These
+  /// functions are always materialized each time the module is loaded to 
ensure that
+  /// LLVM can resolve references to them.
+  static boost::unordered_set<std::string> gv_ref_ir_fns_;
+
   /// ID used for debugging (can be e.g. the fragment instance ID)
   std::string id_;
 
@@ -587,11 +612,6 @@ class LlvmCodeGen {
   /// Execution/Jitting engine.
   std::unique_ptr<llvm::ExecutionEngine> execution_engine_;
 
-  /// Keeps track of the external functions that have been included in this 
module
-  /// e.g libc functions or non-jitted impala functions.
-  /// TODO: this should probably be FnPrototype->Functions mapping
-  std::map<std::string, llvm::Function*> external_functions_;
-
   /// Functions parsed from pre-compiled module.  Indexed by 
ImpalaIR::Function enum
   std::vector<llvm::Function*> loaded_functions_;
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2c9b4a9b/be/src/util/symbols-util-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/symbols-util-test.cc b/be/src/util/symbols-util-test.cc
index ffe6bf0..1a0a129 100644
--- a/be/src/util/symbols-util-test.cc
+++ b/be/src/util/symbols-util-test.cc
@@ -17,8 +17,6 @@
 #include <iostream>
 #include <gtest/gtest.h>
 
-#include "codegen/llvm-codegen.h"
-#include "util/cpu-info.h"
 #include "util/symbols-util.h"
 
 #include "common/names.h"
@@ -325,7 +323,5 @@ TEST(SymbolsUtil, ManglingPrepareOrClose) {
 
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
-  impala::CpuInfo::Init();
-  impala::LlvmCodeGen::InitializeLlvm();
   return RUN_ALL_TESTS();
 }

Reply via email to