>From 94f503013b448de1747e51e7fb62259f3917e65f Mon Sep 17 00:00:00 2001
From: Yin Ma <yinma@codeaurora.org>
Date: Fri, 2 May 2014 17:50:11 -0700
Subject: [PATCH] Greedy LLVM Inliner

---
 include/llvm/InitializePasses.h            |    2 +
 include/llvm/LinkAllPasses.h               |    2 +
 include/llvm/Transforms/IPO.h              |   11 +
 include/llvm/Transforms/IPO/InlinerPass.h  |   13 +
 lib/Transforms/IPO/CMakeLists.txt          |    2 +
 lib/Transforms/IPO/GreedyInliner.cpp       | 1056 ++++++++++++++++++++++++++++
 lib/Transforms/IPO/GreedyInlinerHelper.cpp |   85 +++
 lib/Transforms/IPO/Inliner.cpp             |   13 +-
 lib/Transforms/IPO/PassManagerBuilder.cpp  |   26 +-
 tools/opt/opt.cpp                          |   24 +-
 10 files changed, 1223 insertions(+), 11 deletions(-)
 create mode 100644 lib/Transforms/IPO/GreedyInliner.cpp
 create mode 100644 lib/Transforms/IPO/GreedyInlinerHelper.cpp

diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 20074f0..8a211df 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -135,6 +135,8 @@ void initializeGVNPass(PassRegistry&);
 void initializeGlobalDCEPass(PassRegistry&);
 void initializeGlobalOptPass(PassRegistry&);
 void initializeGlobalsModRefPass(PassRegistry&);
+void initializeGreedyInlinerPass(PassRegistry&);
+void initializeGreedyInlinerHelperPass(PassRegistry&);
 void initializeIPCPPass(PassRegistry&);
 void initializeIPSCCPPass(PassRegistry&);
 void initializeIVUsersPass(PassRegistry&);
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index b7f832d..022833f 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -149,6 +149,8 @@ namespace {
       (void) llvm::createPrintBasicBlockPass(*(llvm::raw_ostream*)nullptr);
       (void) llvm::createModuleDebugInfoPrinterPass();
       (void) llvm::createPartialInliningPass();
+      (void) llvm::createGreedyInliningPass();
+      (void) llvm::createGreedyInlinerHelperPass();
       (void) llvm::createLintPass();
       (void) llvm::createSinkingPass();
       (void) llvm::createLowerAtomicPass();
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index ce1a7d6..e373d9a 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -190,6 +190,17 @@ ModulePass *createMergeFunctionsPass();
 ModulePass *createPartialInliningPass();
 
 //===----------------------------------------------------------------------===//
+/// createGreedyInliningPass - Module-level inliner driven by a greedy call
+/// site queue. createGreedyInlinerHelperPass builds the helper inliner that
+/// the greedy pass drives (see lib/Transforms/IPO/GreedyInliner.cpp).
+Pass *createGreedyInliningPass();
+Pass *createGreedyInliningPass(int OptLevel);
+Pass *createGreedyInliningPass(int Threshold, int OptLevel);
+Pass *createGreedyInlinerHelperPass();
+Pass *createGreedyInlinerHelperPass(int Threshold);
+Pass *createGreedyInlinerHelperPass(int Threshold, int OptLevel);
+
+//===----------------------------------------------------------------------===//
 // createMetaRenamerPass - Rename everything with metasyntatic names.
 //
 ModulePass *createMetaRenamerPass();
diff --git a/include/llvm/Transforms/IPO/InlinerPass.h b/include/llvm/Transforms/IPO/InlinerPass.h
index 6a644ad..992046f 100644
--- a/include/llvm/Transforms/IPO/InlinerPass.h
+++ b/include/llvm/Transforms/IPO/InlinerPass.h
@@ -18,6 +18,7 @@
 #define LLVM_TRANSFORMS_IPO_INLINERPASS_H
 
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/ADT/SmallVector.h"
 
 namespace llvm {
   class CallSite;
@@ -74,6 +75,12 @@ struct Inliner : public CallGraphSCCPass {
   /// deal with that subset of the functions.
   bool removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly = false);
 
+  /// Interface for the Greedy Inliner: per-call-site inputs and results.
+  void setPreferredCallSite(CallSite CS) { PreferredCS = CS; }
+  void setBonusThreshold(int Bonus) { BonusThreshold = Bonus; }
+  int getBonusThreshold() { return BonusThreshold; }
+  SmallVector<WeakVH, 8>& getInlinedCalls() { return InlinedCalls; }
+
 private:
   // InlineThreshold - Cache the value here for easy access.
   unsigned InlineThreshold;
@@ -84,6 +91,12 @@ private:
   /// shouldInline - Return true if the inliner should attempt to
   /// inline at the given CallSite.
   bool shouldInline(CallSite CS);
+
+  /// State exchanged with the Greedy Inliner through the accessors above.
+  CallSite PreferredCS;
+  int BonusThreshold;
+  SmallVector<WeakVH, 8> InlinedCalls;
+
 };
 
 } // End llvm namespace
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 90c1c33..955d095 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -12,6 +12,8 @@ add_llvm_library(LLVMipo
   InlineAlways.cpp
   InlineSimple.cpp
   Inliner.cpp
+  GreedyInliner.cpp
+  GreedyInlinerHelper.cpp
   Internalize.cpp
   LoopExtractor.cpp
   MergeFunctions.cpp
diff --git a/lib/Transforms/IPO/GreedyInliner.cpp b/lib/Transforms/IPO/GreedyInliner.cpp
new file mode 100644
index 0000000..eea7c17
--- /dev/null
+++ b/lib/Transforms/IPO/GreedyInliner.cpp
@@ -0,0 +1,1056 @@
+//===- GreedyInliner.cpp - A module inliner pass       --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a module inliner pass with greedy callsite queue. This
+// Inliner reuses the original SCC inliner to do local decision and actual
+// inlining work.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "greedyinliner"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+
+using namespace llvm;
+
+STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumCallSiteProcessed, "Number of call sites processed");
+STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
+// Opt-in switch; has external linkage, presumably read by pipeline setup.
+cl::opt<bool>
+EnableGreedyInliner("greedy-inliner", cl::init(false), cl::Hidden,
+                    cl::ZeroOrMore,
+             cl::desc("Enable module inliner pass with greedy priority queue"));
+// Call sites whose weight is <= this value are dropped from the queue.
+cl::opt<double>
+  InlineWeightThreshold("inline-weight-threshold", cl::init(0), cl::Hidden,
+   cl::ZeroOrMore,
+   cl::desc("Inline any call site with its weight larger than this threshold"));
+// Growth limit in percent; 0 disables it. CodeSize mode defaults it to 400.
+cl::opt<double>
+  InlineGrowthRateThreshold("inline-module-growth", cl::init(0), cl::Hidden,
+      cl::ZeroOrMore,
+      cl::desc("The maximum size of module growth in code size in percentage"));
+// With OptLevel > 2, selects the "Performance" tuning over "Balanced".
+cl::opt<bool>
+  InlinePerfMode("inline-perf-mode", cl::init(false), cl::Hidden,
+      cl::ZeroOrMore,
+      cl::desc("Set inliner into performance mode with -O3 and above."));
+// Debugging aid: enables printing of the call site weight computation.
+cl::opt<bool>
+  InlineDebugWeight("inline-debug-print-weight", cl::init(false), cl::Hidden,
+      cl::ZeroOrMore,
+      cl::desc("Print weight computation in debugging."));
+// Options defined in other translation units.
+extern cl::opt<bool> LTOFileEmit;
+extern cl::opt<int> HintThreshold;
+
+namespace {
+  // Queue entry: (call site, (weight, inline history for that call site)).
+  typedef SmallVector<std::pair<CallSite,
+                 std::pair<double, std::vector<Function*>>>, 16> CallSiteVector;
+  typedef SmallVectorImpl<std::pair<CallSite,
+                 std::pair<double, std::vector<Function*>>>> CallSiteVectorImpl;
+  /// Module pass that drives the SCC-based inliner one call site at a time.
+  class GreedyInliner : public ModulePass {
+  public:
+    static char ID;
+    GreedyInliner() : ModulePass(ID) {
+      initializeGreedyInlinerPass(*PassRegistry::getPassRegistry());
+      initializeDefaultSetting(275, 3);
+    }
+
+    GreedyInliner(int CurOptLevel) : ModulePass(ID) {
+      initializeGreedyInlinerPass(*PassRegistry::getPassRegistry());
+      initializeDefaultSetting(275, CurOptLevel);
+    }
+
+    GreedyInliner(int Threshold, int CurOptLevel) : ModulePass(ID) {
+      initializeGreedyInlinerPass(*PassRegistry::getPassRegistry());
+      initializeDefaultSetting(Threshold, CurOptLevel);
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LoopInfo>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<CallGraphWrapperPass>();
+      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+    }
+
+    bool runOnModule(Module &M);
+
+  protected:
+    double ComputeCallSiteWeight(CallSite& CS);
+    int ComputeCallSiteWeightBenefit(CallSite& CS);
+    unsigned CollectFunctionCallSites(Function* F, CallSiteVector& CallSites,
+                              unsigned long& BBCount, unsigned long& InstCount);
+    void ComputeCallSitesWeight(CallSiteVector& CallSites);
+    CallSite GetBestCallSite(CallSiteVector& CallSites,
+                             std::vector<Function*>& InlineHistory);
+    void CollectInvalidCallSites(Function* Caller, CallSiteVector& CallSites,
+                                 SmallVector<CallSite, 16>& InvalidCallSites);
+    void CleanInvalidCallSites(CallSiteVector& CallSites,
+                               SmallVector<CallSite, 16>& InvalidCallSites);
+    void UpdateCallSitesAfterInlining(Function* Caller, Function* Callee,
+                                      SmallVector<WeakVH, 8>& InlinedCalls,
+                                      CallSiteVector& CallSites,
+                                      std::vector<Function*>& InlineHistory);
+    unsigned long ComputeFunctionSize(Function* F);
+    int ComputeBonusThreshold(CallSite& CS);
+    int GetBranchLevel(Function* Caller, Instruction* I);
+    void initializeDefaultSetting(int Threshold, int CurOptLevel);
+    void initializeCSBonusSetting(int Threshold);
+    bool isPreferredCS(CallSite& CS);
+
+    bool RunInliner(CallSite CS);
+
+    const TargetLibraryInfo *TLI;
+    InlineCostAnalysis *ICA;
+    Inliner* FuncInliner;
+    unsigned OptLevel;
+    int OverallThreshold;
+    std::map<Function*, unsigned long> FunctionSizes;
+    std::map<Function*, unsigned long> FunctionCallCount;
+    unsigned long TotalSize;
+    long SizeDelta;
+
+    // Function Size Computation.
+    // InstrNum + BasicBlockSizeCost * BlockNum.
+    unsigned BasicBlockSizeCost;
+
+    // Bonus is provided to the original SCC-based inliner. This value is
+    // added on top of its threshold. To encourage or discourage inlining of
+    // call sites with a certain feature, give that feature a positive or
+    // negative bonus.
+
+    // Bonus if a callee has an array alloca.
+    int BonusArrayAlloca;
+    // Bonus if a callee has a static alloca.
+    int BonusStaticAlloca;
+    // Bonus if the callee has linkonce linkage and only has one use.
+    int BonusLinkOnceOneUse;
+    // Bonus if the callee has linkonce linkage and only has two uses.
+    int BonusLinkOnceTwoUse;
+    // Bonus if the callee has linkonce linkage and any other number of uses.
+    int BonusLinkOnceManyUse;
+    // Bonus if the callee is local linkage and only has two uses.
+    int BonusLocalTwoUse;
+    // Bonus used to reverse call penalty in SCC inliner cost computation.
+    int BonusReverseCallPenalty;
+    // Apply SameCall Penalty if above this density.
+    double BonusSameCallDensity;
+    // Lower bound on same-callee calls for the function-level SameCall check.
+    int BonusSameCallFunctionLowerBound;
+    // Lower bound on same-callee calls for the block-level SameCall check.
+    int BonusSameCallBlockLowerBound;
+
+    // Benefit is used in computing the weight of a call site. To have a
+    // call site processed before other call sites, give it a larger
+    // benefit.
+
+    // Benefit if an argument is constant.
+    int BenefitConstantArgument;
+    // Benefit if an argument is not constant.
+    int BenefitVariableArgument;
+    // Benefit if callee has an array alloca.
+    int BenefitArrayAlloca;
+    // Benefit if callee has a static alloca.
+    int BenefitStaticAlloca;
+    // Benefit if callee has alwaysinline attribute
+    int BenefitAlwaysInline;
+
+    // Factor is the scale used in weight computation. It is used for
+    // increasing or decreasing the importance of an element in the
+    // equation.
+
+    // Factor for LoopDepthBonus.
+    int LoopDepthBonusFactor;
+    // Factor for the use count of this callee if there is only one use.
+    int OneUseBonusFactor;
+    // Factor for the use count of this callee if there is only two uses.
+    int TwoUseBonusFactor;
+    // Factor for the whole weight that makes weight threshold more effective.
+    int WeightScaleFactor;
+    // Enable it to give calls inside branches a lower weight in the caller.
+    bool ApplyBranchLevel;
+
+  };
+}
+
+char GreedyInliner::ID = 0;
+INITIALIZE_PASS_BEGIN(GreedyInliner, "greedy-inliner", "Greedy Inliner",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) // required by getAnalysisUsage
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(FunctionAttrs)
+INITIALIZE_PASS_END(GreedyInliner, "greedy-inliner", "Greedy Inliner",
+                    false, false)
+/// Factory with the default threshold (275) and optimization level (3).
+Pass* llvm::createGreedyInliningPass() { return new GreedyInliner(); }
+/// Factory with the default threshold (275) and the given OptLevel.
+Pass* llvm::createGreedyInliningPass(int OptLevel) {
+  return new GreedyInliner(OptLevel);
+}
+/// Factory with an explicit inline Threshold and OptLevel.
+Pass* llvm::createGreedyInliningPass(int Threshold, int OptLevel) {
+  return new GreedyInliner(Threshold, OptLevel);
+}
+
+/// initializeDefaultSetting - Initialize all control values of the pass.
+void GreedyInliner::initializeDefaultSetting(int Threshold, int CurOptLevel) {
+  // Picks Balanced, Performance or CodeSize tuning from OptLevel and options.
+  // Global Settings.
+  ICA = NULL;
+  FuncInliner = NULL;
+  OptLevel = CurOptLevel;
+
+  // Threshold Settings.
+  OverallThreshold = Threshold;
+  // For OptLevel <= 2 (e.g. -Os), the greedy inliner can use a lower
+  // overall threshold to achieve similar performance.
+  if (OptLevel <= 2 && Threshold > 35)
+      OverallThreshold = 35;
+  // If we are in the LTO individual file compilation phase, we should only
+  // do very basic inlining and leave the major job to the whole file
+  // compilation phase.
+  if (LTOFileEmit)
+    OverallThreshold = 15;
+
+  // Common Settings.
+  BasicBlockSizeCost = 4;
+  WeightScaleFactor = 100;
+  BenefitAlwaysInline = 10000;
+  // Benefits scale with the requested Threshold, not OverallThreshold.
+  BenefitArrayAlloca = Threshold / 6;
+  BenefitStaticAlloca = Threshold / 30;
+  BenefitConstantArgument = Threshold / 15;
+  BenefitVariableArgument = Threshold / 300;
+
+  if (OptLevel > 2 && !InlinePerfMode) {
+    // Balanced Mode
+    LoopDepthBonusFactor = 10;
+    OneUseBonusFactor = 10;
+    TwoUseBonusFactor = 1;
+
+    ApplyBranchLevel = true;
+  } else if (OptLevel > 2 && InlinePerfMode) {
+    // Performance Mode
+    LoopDepthBonusFactor = 10;
+    OneUseBonusFactor = 10;
+    TwoUseBonusFactor = 2;
+
+    ApplyBranchLevel = false;
+  } else {
+    // CodeSize Mode
+    BenefitConstantArgument = Threshold / 6;
+    BenefitVariableArgument = Threshold / 30;
+
+    LoopDepthBonusFactor = 0;
+    OneUseBonusFactor = 100;
+    TwoUseBonusFactor = 1;
+
+    ApplyBranchLevel = true;
+
+    if (!InlineGrowthRateThreshold.getNumOccurrences())
+      InlineGrowthRateThreshold = 400;
+  }
+  // Must run last: it reads OptLevel and OverallThreshold set above.
+  initializeCSBonusSetting(Threshold);
+}
+
+/// initializeCSBonusSetting - Initialize the Bonus* values from Threshold.
+void GreedyInliner::initializeCSBonusSetting(int Threshold) {
+  // Common portion.
+  BonusLinkOnceOneUse = 0;
+  BonusLinkOnceTwoUse = 0;
+  BonusLinkOnceManyUse = 0;
+  BonusLocalTwoUse = 0;
+  // Same-call penalty tuning, read by ComputeBonusThreshold.
+  BonusSameCallDensity = 0.3;
+  BonusSameCallBlockLowerBound = 4;
+  BonusSameCallFunctionLowerBound = 10;
+
+  BonusReverseCallPenalty = InlineConstants::CallPenalty;
+
+  if (OptLevel > 2 && !InlinePerfMode) {
+    // Balanced Mode
+    BonusArrayAlloca = Threshold * 0.33;
+    BonusStaticAlloca = Threshold * 0.03;
+
+    BonusLinkOnceOneUse = 20;
+    BonusLocalTwoUse = Threshold;
+  } else if (OptLevel > 2 && InlinePerfMode) {
+    // Performance Mode
+    BonusArrayAlloca = Threshold * 1.33;
+    BonusStaticAlloca = Threshold * 0.33;
+
+    BonusLinkOnceOneUse = -InlineConstants::LastCallToStaticBonus;
+    BonusLocalTwoUse = Threshold * 2;
+    BonusLinkOnceTwoUse = Threshold * 2;
+  } else {
+    // CodeSize Mode
+    BonusArrayAlloca = Threshold * 1.33;
+    BonusStaticAlloca = Threshold * 0.33;
+
+    BonusSameCallFunctionLowerBound = 6;
+
+    // The default value is too small for NonLocalLinkage
+    if (OverallThreshold > 20) {
+      BonusLinkOnceOneUse = OverallThreshold * 1.5 - 20;
+      BonusLinkOnceTwoUse = OverallThreshold - 20;
+      BonusLinkOnceManyUse = BonusLinkOnceTwoUse;
+    }
+    // Side effect: this may lower the global HintThreshold option.
+    // The default value is too big for InlineHint
+    if (HintThreshold.getNumOccurrences() == 0 &&
+        HintThreshold >  OverallThreshold)
+        HintThreshold = OverallThreshold;
+  }
+}
+
+/// runOnModule - The driver of this greedy inliner. This inliner builds
+/// a global queue of all valid call sites in the module, each with a weight.
+/// It then greedily inlines the call site with the current best weight, one
+/// at a time, until a threshold is reached. This inliner drives the original
+/// LLVM inliner to make the local decision and do the actual inlining.
+bool GreedyInliner::runOnModule(Module& M) {
+  // NOTE(review): ICA and FuncInliner are heap-allocated; no delete here.
+  TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
+  if (!ICA)
+    ICA = new InlineCostAnalysis();
+
+  bool Changed = false;
+
+  CallSiteVector CallSites;
+  TotalSize = 0;
+
+  // Collect all call sites and compute function size.
+  for (Module::iterator it = M.begin(); it != M.end(); ++it) {
+    Function* F = &*it;
+    if (F->isDeclaration())
+      continue;
+
+    unsigned long BBCount;
+    unsigned long InstCount;
+    FunctionCallCount[F] = CollectFunctionCallSites(F, CallSites, BBCount,
+                                                    InstCount);
+    unsigned long FS = InstCount + BasicBlockSizeCost * BBCount;
+    FunctionSizes[F] = FS;
+
+    // We only inline linkonce functions into other functions.
+    // And not count it for module size growth.
+    if (!F->hasLinkOnceLinkage())
+      TotalSize += FS;
+  }
+
+  // Compute weight for each call site since FunctionSizes is complete.
+  ComputeCallSitesWeight(CallSites);
+
+  // Do inlining loop.
+  FuncInliner = (Inliner*)
+                llvm::createGreedyInlinerHelperPass(OverallThreshold);
+
+  FuncInliner->setResolver(getResolver());
+
+  SizeDelta = 0;
+  while (CallSites.size() > 0) {
+    // FIXME: we may use std::set for InlineHistory to utilize binary search.
+    std::vector<Function*> InlineHistory;
+    CallSite CS = GetBestCallSite(CallSites, InlineHistory);
+    NumCallSiteProcessed ++;
+    Function* Caller = CS.getCaller();
+    Function* Callee = CS.getCalledFunction();
+
+    DEBUG(dbgs() << "#" << NumCallSiteProcessed << " "
+                 << Caller->getName()
+                 << "(" << Caller->getNumUses() << ") "
+                 << " <- " << Callee->getName()
+                 << "(" << Callee->getNumUses() << ")");
+
+    // If Callee may get deleted, collect the queued CS that become invalid.
+    SmallVector<CallSite, 16> InvalidCallSites;
+    if (Callee->hasOneUse() && Callee->isDiscardableIfUnused()) {
+      CollectInvalidCallSites(Callee, CallSites, InvalidCallSites);
+    }
+
+    int BonusThreshold = ComputeBonusThreshold(CS);
+    // Wrap Caller in a one-node SCC so the SCC inliner handles only it.
+    CallGraphNode Node(Caller);
+    std::vector<CallGraphNode*> NodeVec;
+    NodeVec.push_back(&Node);
+    CallGraphSCC  SCC(NULL);
+    SCC.initialize(&NodeVec[0], &NodeVec[0]+NodeVec.size());
+    FuncInliner->setPreferredCallSite(CS);
+    FuncInliner->setBonusThreshold(BonusThreshold);
+    bool LocalChanged = FuncInliner->runOnSCC(SCC);
+    Changed |= LocalChanged;
+
+    DEBUG(if (LocalChanged) dbgs() << "\t\tInlined\n";
+            else dbgs() << "\t\tNOT Inlined \n");
+
+    if (!LocalChanged)
+      continue;
+
+    NumInlined++;
+    SmallVector<WeakVH, 8>& InlinedCalls = FuncInliner->getInlinedCalls();
+
+    // Update Growth Rate.
+    unsigned long SizeBefore = FunctionSizes[Caller];
+    unsigned long SizeAfter = ComputeFunctionSize(Caller);
+    FunctionSizes[Caller] = SizeAfter;
+    if (!Caller->hasLinkOnceLinkage())
+      SizeDelta += SizeAfter - SizeBefore;
+
+    // Check if callee is now unused and discardable (treated as removed).
+    if (Callee->use_empty() && Callee->isDiscardableIfUnused()) {
+        CleanInvalidCallSites(CallSites, InvalidCallSites);
+        if (Callee->hasLocalLinkage())
+          SizeDelta -= FunctionSizes[Callee];
+        NumDeleted++;
+    }
+
+    // Check Growth Rate Threshold.
+    double GrowthRate = (double)(SizeDelta + TotalSize)/(double)TotalSize;
+    if (InlineGrowthRateThreshold > 0 &&
+        GrowthRate > (InlineGrowthRateThreshold/100)) {
+      DEBUG(dbgs() << "Greedy Inliner stopped due to growth rate :"
+                   << GrowthRate << "\n");
+      break;
+    }
+
+    InlineHistory.push_back(Callee);
+    // Update CallSites after inlining.
+    UpdateCallSitesAfterInlining(Caller, Callee, InlinedCalls, CallSites,
+                                 InlineHistory);
+  }
+
+  return Changed;
+}
+/// isPreferredCS - In LTO per-file mode, keep only cheap-to-inline callees.
+bool GreedyInliner::isPreferredCS(CallSite& CS)
+{
+  // Outside LTO per-file emission every call site is a candidate.
+  if (!LTOFileEmit)
+    return true;
+
+  Function* Callee = CS.getCalledFunction();
+  // Look for the cases that always benefit from inlining:
+  // 1. local linkage with only one use
+  // 2. body not larger than the number of arguments plus 4
+  // 3. very small functions (size within OverallThreshold)
+  if (Callee->hasLocalLinkage() && Callee->hasOneUse())
+    return true;
+
+  // During the initial scan the callee may not have been sized yet, and
+  // operator[] would silently report 0; size it on demand instead.
+  std::map<Function*, unsigned long>::iterator Known =
+      FunctionSizes.find(Callee);
+  const unsigned long CalleeSize =
+      Known != FunctionSizes.end() ? Known->second
+                                   : ComputeFunctionSize(Callee);
+
+  const unsigned long ArgNum = CS.arg_size();
+  if (CalleeSize <= ArgNum + 4)
+    return true;
+
+  return CalleeSize <= (unsigned long)OverallThreshold;
+}
+
+/// CollectFunctionCallSites - Traverse the given function and put valid
+/// call site candidates into the call site queue. BBCount and InstCount
+/// receive the block count and the number of non-debug instructions that
+/// remain in F; the return value is the number of call sites queued.
+unsigned GreedyInliner::CollectFunctionCallSites(Function* F,
+                                                 CallSiteVector& CallSites,
+                             unsigned long& BBCount, unsigned long& InstCount) {
+  unsigned CSCount = 0;
+  BBCount = 0;
+  InstCount = 0;
+  for (Function::iterator BB = F->begin(), BE = F->end(); BB != BE; ++BB) {
+    ++BBCount;
+    for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE;) {
+      // Advance first: the current instruction may be erased below.
+      Instruction* Inst = &*I;
+      ++I;
+
+      // Debug intrinsics do not contribute to the function size.
+      if (isa<DbgInfoIntrinsic>(Inst))
+        continue;
+
+      CallSite CS(cast<Value>(Inst));
+      // If this isn't a call, or it is a call to an intrinsic, it can
+      // never be inlined.
+      if (!CS || isa<IntrinsicInst>(Inst)) {
+        ++InstCount;
+        continue;
+      }
+
+      // Remove dead call based on computation from FunctionAttrs pass.
+      // The erased call is deliberately not added to InstCount, so the
+      // recorded size matches what is left in the function.
+      if (isInstructionTriviallyDead(Inst, TLI)) {
+        DEBUG(dbgs() << "Deleting dead call: " << *Inst << "\n");
+        // NOTE(review): NumDeleted is described as a count of deleted
+        // functions, yet it is also bumped for each dead call here.
+        NumDeleted ++;
+        Inst->eraseFromParent();
+        continue;
+      }
+      // A live call stays in the function, so it counts toward the size.
+      ++InstCount;
+
+      Function* Callee = CS.getCalledFunction();
+      // No indirect call.
+      if (Callee == NULL)
+        continue;
+
+      // Only declaration.
+      if (Callee->isDeclaration())
+        continue;
+
+      // No recursive call.
+      if (Callee == F)
+        continue;
+
+      if (!isPreferredCS(CS)) {
+        DEBUG(dbgs() << "Skipped Non-Preferred Call: " << *Inst << "\n");
+        continue;
+      }
+
+      ++CSCount;
+      CallSites.push_back(std::make_pair(CS,
+                                std::make_pair(0.0, std::vector<Function*>())));
+    }
+  }
+  return CSCount;
+}
+
+/// ComputeCallSitesWeight - Assign a weight to every entry of the call site
+/// queue and drop the entries whose weight is <= InlineWeightThreshold.
+void GreedyInliner::ComputeCallSitesWeight(CallSiteVector& CallSites) {
+  CallSiteVectorImpl::iterator Cur = CallSites.begin();
+  while (Cur != CallSites.end()) {
+    CallSite& Site = Cur->first;
+    const double W = ComputeCallSiteWeight(Site);
+    // Keep the entry and record its weight.
+    if (!(W <= InlineWeightThreshold)) {
+      Cur->second.first = W;
+      ++Cur;
+      continue;
+    }
+
+    // Otherwise remove it; erase() yields the next entry to visit.
+    DEBUG(dbgs() << "Skip This CallSite " << Site.getCaller()->getName()
+                 << " << " << Site.getCalledFunction()->getName()
+                 << " [" << W << "]\n");
+    Cur = CallSites.erase(Cur);
+  }
+}
+
+/// ComputeBonusThreshold - Compute the extra threshold that is added to the
+/// original inliner threshold for CS; the result may be negative.
+int GreedyInliner::ComputeBonusThreshold(CallSite& CS) {
+    // CSThreshold is the helper inliner's own threshold for this call site.
+    unsigned CSThreshold = FuncInliner->getInlineThreshold(CS);
+    initializeCSBonusSetting(CSThreshold);
+
+    Function* Caller = CS.getCaller();
+    Function* Callee = CS.getCalledFunction();
+
+    int Bonus = 0;
+
+    // Various use relative bonuses.
+    // Giving this bonus will increase code size. Only local linkage with one
+    // use can reduce code size always.
+    if (Callee->hasLinkOnceLinkage()) {
+       switch(Callee->getNumUses()) {
+         case 1:
+           Bonus += BonusLinkOnceOneUse;
+         break;
+         case 2:
+           Bonus += BonusLinkOnceTwoUse;
+         break;
+         default:
+           Bonus += BonusLinkOnceManyUse;
+       }
+    }else if (Callee->hasLocalLinkage() && Callee->getNumUses() == 2)
+       Bonus += BonusLocalTwoUse;
+
+    // Alloca Bonus.
+    Function *F = Callee;
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+        if (isa<AllocaInst>(I)) {
+          AllocaInst* AI = cast<AllocaInst>(I);
+          if (AI->isArrayAllocation()) {
+            Bonus += BonusArrayAlloca;
+          }else if (AI->isStaticAlloca()) {
+            Bonus += BonusStaticAlloca;
+          }
+        }
+      }
+    }
+
+    // Switch same call penalty.
+    int NumCalls = 0;
+    for (Value::user_iterator UI = Callee->user_begin(),
+         UE = Callee->user_end(); UI != UE; ++UI) {
+      CallSite Call(*UI);
+      if (!Call || Call.getCalledFunction() != F)
+        continue;
+      if (Call.getCaller() != Caller)
+        continue;
+      ++NumCalls;
+    }
+
+    BasicBlock* BB = CS.getInstruction()->getParent();
+    SwitchInst *SI = NULL;
+    DominatorTreeWrapperPass* DTWP =
+                                &getAnalysis<DominatorTreeWrapperPass>(*Caller);
+    DominatorTree& DT = DTWP->getDomTree();
+    int IC = 0;
+    DomTreeNode* DN = NULL;
+    if (DT.getNode(BB))
+      DN = DT.getNode(BB)->getIDom();
+    while (DN) {
+      BasicBlock* StmtBB = DN->getBlock();
+      TerminatorInst* I = StmtBB->getTerminator();
+
+      if (const BranchInst *BI = dyn_cast<BranchInst>(I)) {
+        if (BI->isConditional()) {
+          IC ++;
+        }
+      }else if ((SI = dyn_cast<SwitchInst>(I))) {
+        break;
+      }
+
+      DN = DN->getIDom();
+    }
+    bool SwitchPenaltyApplied = false;
+    if (SI && NumCalls > 1) {
+      // A default-only switch that is not nested under a conditional branch
+      // has no cases and IC == 0; count one path so we never divide by zero.
+      const unsigned Cases = SI->getNumCases() + IC;
+      const unsigned Paths = Cases ? Cases : 1;
+      Bonus -= CSThreshold - CSThreshold / (Paths * NumCalls);
+      SwitchPenaltyApplied = true;
+    }
+
+    // Very high density same call penalty in a block or function.
+    if (!SwitchPenaltyApplied && NumCalls > BonusSameCallBlockLowerBound) {
+      unsigned NumCallsInBlock = 0;
+      unsigned InstrNum = 0;
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I ++) {
+        if (isa<DbgInfoIntrinsic>(I))
+          continue;
+
+        InstrNum ++;
+        CallSite BlockCS(cast<Value>(I));
+        // If this isn't a call, or it is a call to an intrinsic, it can
+        // never be inlined.
+        if (!BlockCS || isa<IntrinsicInst>(I))
+          continue;
+
+        if (BlockCS.getCalledFunction() == F)
+          NumCallsInBlock ++;
+      }
+
+      double Density = (double)NumCallsInBlock / (double) InstrNum;
+      // Add penalty.
+      if (Density > BonusSameCallDensity)
+        Bonus -= (CSThreshold - CSThreshold/NumCalls) * (1 - Density);
+      else if (NumCalls > BonusSameCallFunctionLowerBound) {
+        unsigned long CallerSize = FunctionSizes[Caller];
+        double Density = (double)NumCalls / (double)CallerSize;
+        // Add penalty.
+        if (Density > BonusSameCallDensity)
+          Bonus -= (CSThreshold - CSThreshold/NumCalls) * (1 - Density);
+      }
+    }
+
+    // Try to not inline big functions when a callee has too many uses.
+    const int ArgNum = CS.arg_size();
+    if (Callee->getNumUses() > (CSThreshold * (ArgNum+1)))
+      Bonus -= CSThreshold *
+               (1 -  (double)(CSThreshold * (ArgNum+1)) /
+               (double)Callee->getNumUses());
+
+    // Reverse the penalty of calls.
+    // It is the key operation to make priority queue effective.
+    Bonus += FunctionCallCount[Callee] * BonusReverseCallPenalty;
+
+    // Special intrinsic penalty.
+    for (Function::iterator BB = Callee->begin(), E = Callee->end();
+         BB != E; ++BB) {
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+        CallSite CalleeCS(cast<Value>(I));
+
+        if (!CalleeCS)
+          continue;
+
+        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CalleeCS.getInstruction())) {
+          switch (II->getIntrinsicID()) {
+          default:
+            break;
+          case Intrinsic::memset:
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+            // They will be expanded to multiple instruction;
+            Bonus -= InlineConstants::InstrCost * 4;
+            break;
+          }
+        }
+      }
+    }
+
+    // Reverse SingleBB Bonus. This must be the last bonus adjustment so the
+    // computation is correct. The sum is formed in signed arithmetic: Bonus
+    // may be negative and must not be promoted to unsigned by CSThreshold.
+    int BBCount = 0;
+    for (Function::iterator BB = Callee->begin(), E = Callee->end();
+         BB != E; ++BB) {
+      BBCount ++;
+      if (BBCount > 1)
+        break;
+    }
+    if (BBCount == 1)
+      Bonus -= (Bonus + static_cast<int>(CSThreshold)) / 3;
+
+    return Bonus;
+}
+
+/// UpdateCallSitesAfterInlining - Insert the call sites introduced by the
+/// previous inlining into the call site queue. Also update the weight of the
+/// queued call sites that call Caller, since Caller's size has changed.
+void GreedyInliner::UpdateCallSitesAfterInlining(Function* Caller,
+                         Function* Callee, SmallVector<WeakVH, 8>& InlinedCalls,
+                                                      CallSiteVector& CallSites,
+                                        std::vector<Function*>& InlineHistory) {
+  // Collect CS in the updated caller.
+  CallSiteVector NewCallSites;
+  for (unsigned i = 0, e = InlinedCalls.size(); i != e; ++i) {
+     Value *Ptr = InlinedCalls[i];
+
+     CallSite CS(Ptr);
+
+     // If this isn't a call, or it is a call to an intrinsic, it can
+     // never be inlined.
+     if (!CS || isa<IntrinsicInst>(Ptr))
+       continue;
+
+     // No indirect call.
+     if (CS.getCalledFunction() == NULL)
+       continue;
+
+     // Only declaration.
+     if (CS.getCalledFunction()->isDeclaration())
+       continue;
+
+     if (!isPreferredCS(CS))
+       continue;
+
+     NewCallSites.push_back(std::make_pair(CS,
+                                std::make_pair(0.0, std::vector<Function*>())));
+  }
+  FunctionCallCount[Caller] = FunctionCallCount[Caller] + NewCallSites.size()
+                              - 1; // -1: presumably the call just inlined
+  ComputeCallSitesWeight(NewCallSites);
+
+  // Re-weight queued call sites whose callee is Caller; drop the weak ones.
+  for (CallSiteVectorImpl::iterator I = CallSites.begin();
+       I != CallSites.end();) {
+    CallSite& CS = I->first;
+    if (CS.getCalledFunction() == Caller) {
+      // Update weight.
+      double Weight = ComputeCallSiteWeight(CS);
+      if (Weight <= InlineWeightThreshold) {
+        DEBUG(dbgs() << "Skip CallSite" << CS.getCaller()->getName()
+                     << " << " <<  CS.getCalledFunction()->getName()
+                     << " [" << Weight << "]\n");
+        I = CallSites.erase(I);
+        continue;
+      }
+      I->second.first = Weight;
+    }
+
+    // Continue.
+    ++I;
+  }
+
+  // Add new call sites into CallSites.
+  for (CallSiteVectorImpl::iterator I =
+       NewCallSites.begin(), E = NewCallSites.end(); I != E; ++I) {
+    CallSite& CS = I->first;
+
+    // A direct call back into Caller is recursive; do not queue it.
+    Function* Callee = CS.getCalledFunction();
+    if (Callee == Caller)
+      continue;
+
+    // Check inline history to prevent recursively inlining.
+    if (std::find(InlineHistory.begin(), InlineHistory.end(), Callee) !=
+        InlineHistory.end())
+      continue;
+
+    I->second.second = InlineHistory;
+    CallSites.push_back(*I);
+  }
+}
+
+/// CollectInvalidCallSites - Collect every call site in the call site queue
+/// that is located inside Caller and append it to InvalidCallSites.
+void GreedyInliner::CollectInvalidCallSites(Function* Caller,
+       CallSiteVector& CallSites, SmallVector<CallSite, 16>& InvalidCallSites) {
+  CallSiteVectorImpl::iterator Cur = CallSites.begin();
+  CallSiteVectorImpl::iterator Last = CallSites.end();
+
+  for (; Cur != Last; ++Cur) {
+    // Only entries whose enclosing function is Caller are of interest.
+    if (Cur->first.getCaller() == Caller)
+      InvalidCallSites.push_back(Cur->first);
+  }
+}
+
+/// CleanInvalidCallSites - Drop from the call site queue every entry that
+/// matches one of the call sites recorded in InvalidCallSites.
+void GreedyInliner::CleanInvalidCallSites(CallSiteVector& CallSites,
+                                  SmallVector<CallSite, 16>& InvalidCallSites) {
+  SmallVectorImpl<CallSite>::iterator BadBegin = InvalidCallSites.begin();
+  SmallVectorImpl<CallSite>::iterator BadEnd = InvalidCallSites.end();
+
+  CallSiteVectorImpl::iterator Pos = CallSites.begin();
+  while (Pos != CallSites.end()) {
+    // A queued entry is stale when it appears in the invalid list.
+    const bool Stale = std::find(BadBegin, BadEnd, Pos->first) != BadEnd;
+
+    // erase() returns the position following the removed element, so the
+    // scan continues from there without skipping anything.
+    if (Stale)
+      Pos = CallSites.erase(Pos);
+    else
+      ++Pos;
+  }
+}
+
+/// ComputeCallSiteWeightBenefit - Compute benefit if inlining the callee.
+/// Starting from 1, the benefit is adjusted for every call argument, for the
+/// allocas found in the callee and for the alwaysinline attribute.
+int GreedyInliner::ComputeCallSiteWeightBenefit(CallSite& CS) {
+  int Total = 1;
+
+  // Constant and non-constant arguments are credited separately.
+  for (CallSite::arg_iterator Arg = CS.arg_begin(), ArgEnd = CS.arg_end();
+       Arg != ArgEnd; ++Arg) {
+    if (isa<Constant>(*Arg))
+      Total += BenefitConstantArgument;
+    else
+      Total += BenefitVariableArgument;
+  }
+
+  // Check alloca for scalar replacement of aggregates transformation (SROA).
+  Function *Callee = CS.getCalledFunction();
+  for (Function::iterator Block = Callee->begin(), BlockEnd = Callee->end();
+       Block != BlockEnd; ++Block) {
+    for (BasicBlock::iterator Inst = Block->begin(), InstEnd = Block->end();
+         Inst != InstEnd; ++Inst) {
+      // Only allocas matter here; debug intrinsics are never allocas.
+      AllocaInst *AI = dyn_cast<AllocaInst>(Inst);
+      if (!AI)
+        continue;
+
+      if (AI->isArrayAllocation())
+        Total += BenefitArrayAlloca;
+      else if (AI->isStaticAlloca())
+        Total += BenefitStaticAlloca;
+    }
+  }
+
+  // AlwaysInline Benefit - inline alwaysinline functions at first.
+  if (Callee->hasFnAttribute(Attribute::AlwaysInline)) {
+    if (ICA->isInlineViable(*Callee))
+      Total += BenefitAlwaysInline;
+    else
+      Total -= BenefitAlwaysInline;
+  }
+
+  return Total;
+}
+
+/// GetBranchLevel - Compute branch level for the given instruction.
+/// The level is 1 plus the number of blocks on the immediate-dominator chain
+/// above I's block that end in a conditional branch or a switch.
+int GreedyInliner::GetBranchLevel(Function* Caller, Instruction* I)
+{
+    DominatorTree& DT =
+               getAnalysis<DominatorTreeWrapperPass>(*Caller).getDomTree();
+
+    // Blocks without a dominator tree node keep the base level.
+    DomTreeNode* Node = DT.getNode(I->getParent());
+    if (!Node)
+      return 1;
+
+    // Walk the immediate-dominator chain up to the root.
+    int Level = 1;
+    for (Node = Node->getIDom(); Node; Node = Node->getIDom()) {
+      TerminatorInst* Term = Node->getBlock()->getTerminator();
+
+      if (const BranchInst *BI = dyn_cast<BranchInst>(Term)) {
+        if (BI->isConditional())
+          ++Level;
+      } else if (isa<SwitchInst>(Term)) {
+        ++Level;
+      }
+    }
+
+    return Level;
+}
+
+/// ComputeCallSiteWeight - Compute the weight of the given call site.
+/// Benefit - A value added to favor certain features of a call site.
+/// LoopDepthBonus - A call site inside a loop is preferred over a call site
+///                  that is not in a loop.
+/// UseBonus - A callee with only one or two uses is considered first, because
+///            processing it early brings benefits such as reducing code size
+///            and the number of functions in the queue, which speeds up the
+///            inliner.
+/// UseCount - A callee with many callers in the module may be left until
+///            after those that have only one or a few.
+/// CalleeSize - Smaller functions are considered first. Sizes such as 10 and
+///              15 differ little, so sqrt is used to make callee sizes denser;
+///              a denser mapping would make the weight threshold hard to tune.
+/// BranchLevel - Branch level of the call minus its loop depth, kept at 1 or
+///               more so that it is always a valid divisor.
+double GreedyInliner::ComputeCallSiteWeight(CallSite& CS) {
+  // This weight equation is designed for performance.
+  Function *Callee = CS.getCalledFunction();
+  Function *Caller = CS.getCaller();
+  unsigned long CalleeSize = FunctionSizes[Callee];
+  unsigned int UseCount = Callee->getNumUses();
+
+  // Both values are divisors below, so check them before dividing.
+  assert(UseCount != 0 && "UserCount is Zero!");
+  assert(CalleeSize != 0 && "CalleeSize is Zero!");
+
+  Instruction* Call = CS.getInstruction();
+  BasicBlock* BB = Call->getParent();
+  LoopInfo *LI = &getAnalysis<LoopInfo>(*Caller);
+  unsigned LoopDepthBonus = 1;
+  LoopDepthBonus += LI->getLoopDepth(BB) * LoopDepthBonusFactor;
+
+  double Benefit = (double) ComputeCallSiteWeightBenefit(CS);
+
+  // Favor the tail calls, bottom-up has the fastest processing speed.
+  double UseBonus = 1.0;
+  if (UseCount == 1) UseBonus *= OneUseBonusFactor;
+  else if (UseCount == 2) UseBonus *= TwoUseBonusFactor;
+
+  // The loop depth can reach or exceed the branch level, which used to give a
+  // zero or negative divisor; clamp the result to the neutral level 1.
+  int BranchLevel = 1;
+  if (ApplyBranchLevel)
+    BranchLevel = std::max(1, GetBranchLevel(Caller, Call) -
+                                  static_cast<int>(LI->getLoopDepth(BB)));
+
+  // FIXME: When profile guided optimization becomes available, we need
+  // introduce the frequency into the weight computation.
+  double Weight = Benefit * LoopDepthBonus * UseBonus * WeightScaleFactor /
+                  (UseCount * sqrt(CalleeSize) * BranchLevel);
+
+  if (InlineDebugWeight) {
+    DEBUG(dbgs() << "   Weight: [" << Weight << "][" << Benefit
+               << "|" << LoopDepthBonus << "|" << UseBonus << "|" << BranchLevel
+               << "|" << UseCount << "|" << sqrt(CalleeSize) << "]"
+               << Caller->getName()
+               << " <- " << Callee->getName() << "\n");
+  }
+  return Weight;
+}
+
+/// GetBestCallSite - Return the call site with maximum weight and remove this
+/// call site from call site queue. InlineHistory is set to the history stored
+/// with the returned call site. CallSites must not be empty.
+CallSite GreedyInliner::GetBestCallSite(CallSiteVector& CallSites,
+                                        std::vector<Function*>& InlineHistory) {
+  assert(!CallSites.empty() && "No call site left to pick from!");
+  // Seed with the first entry: numeric_limits<double>::min() is the smallest
+  // positive double, so all-nonpositive weights left BestIndex uninitialized.
+  CallSiteVectorImpl::iterator BestIndex = CallSites.begin();
+  double MaxWeight = BestIndex->second.first;
+
+  for (CallSiteVectorImpl::iterator I = CallSites.begin(), E = CallSites.end();
+       I != E; ++I) {
+    if (I->second.first > MaxWeight) {
+      BestIndex = I;
+      MaxWeight = I->second.first;
+    }
+  }
+
+  CallSite CS = BestIndex->first;
+  InlineHistory = BestIndex->second.second;
+  CallSites.erase(BestIndex);
+
+  DEBUG(dbgs() << "[" << MaxWeight << "] ");
+  return CS;
+}
+
+/// ComputeFunctionSize - Compute the size of a function with the equation.
+/// BasicBlockSizeCost * BasicBlockNumber + InstructionNumber
+/// Debug info intrinsics are not counted; a declaration has size 0.
+unsigned long GreedyInliner::ComputeFunctionSize(Function* F) {
+    // FIXME: This could be more precise by using the way the SCC inliner
+    // used to compute the inline cost. That way can only count the reachable
+    // blocks. If possible, we may reuse the same code with the SCC inliner.
+    if (F->isDeclaration())
+      return 0;
+
+    unsigned long NumBlocks = 0;
+    unsigned long NumInsts = 0;
+    for (Function::iterator Block = F->begin(), BlockEnd = F->end();
+         Block != BlockEnd; ++Block, ++NumBlocks) {
+      for (BasicBlock::iterator Inst = Block->begin(), InstEnd = Block->end();
+           Inst != InstEnd; ++Inst) {
+        if (!isa<DbgInfoIntrinsic>(Inst))
+          ++NumInsts;
+      }
+    }
+
+    return (NumInsts + BasicBlockSizeCost * NumBlocks);
+}
diff --git a/lib/Transforms/IPO/GreedyInlinerHelper.cpp b/lib/Transforms/IPO/GreedyInlinerHelper.cpp
new file mode 100644
index 0000000..64a33b5
--- /dev/null
+++ b/lib/Transforms/IPO/GreedyInlinerHelper.cpp
@@ -0,0 +1,85 @@
+//===- GreedyInlinerHelper.cpp - helper class for greedy inliner   --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file allows the module greedy inliner to call the SCC inliner.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "greedyinlinerhelper"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Helper pass that lets the greedy module inliner drive the SCC
+/// inliner implementation.
+///
+/// It derives from Inliner, forces the bonus threshold to zero on construction
+/// and prepares its InlineCostAnalysis in runOnSCC.
+class GreedyInlinerHelper : public Inliner {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  GreedyInlinerHelper() : Inliner(ID), ICA(nullptr) {
+    initializeGreedyInlinerHelperPass(*PassRegistry::getPassRegistry());
+    setBonusThreshold(0);
+  }
+
+  GreedyInlinerHelper(int Threshold)
+      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) {
+    initializeGreedyInlinerHelperPass(*PassRegistry::getPassRegistry());
+    setBonusThreshold(0);
+  }
+
+  InlineCost getInlineCost(CallSite CS) override {
+    return ICA->getInlineCost(CS, getBonusThreshold() + getInlineThreshold(CS));
+  }
+
+  bool runOnSCC(CallGraphSCC &SCC) override;
+
+private:
+  InlineCostAnalysis *ICA;
+};
+
+} // end anonymous namespace
+
+char GreedyInlinerHelper::ID = 0;
+INITIALIZE_PASS_BEGIN(GreedyInlinerHelper, "greedy-inliner-helper",
+       "Function Integration/Inlining Wrapper for Greedy Inliner", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
+INITIALIZE_PASS_END(GreedyInlinerHelper, "greedy-inliner-helper",
+       "Function Integration/Inlining Wrapper for Greedy Inliner", false, false)
+// Creates the helper pass through its default constructor.
+Pass *llvm::createGreedyInlinerHelperPass() { return new GreedyInlinerHelper(); }
+// Creates the helper pass with an explicit inline threshold.
+Pass *llvm::createGreedyInlinerHelperPass(int Threshold) {
+  return new GreedyInlinerHelper(Threshold);
+}
+
+bool GreedyInlinerHelper::runOnSCC(CallGraphSCC &SCC) {
+  // Allocate the analysis only once: a fresh one per SCC leaked every time.
+  if (!ICA) {
+    ICA = new InlineCostAnalysis();
+    ICA->setResolver(getResolver());
+  }
+  ICA->runOnSCC(SCC);
+  return Inliner::runOnSCC(SCC);
+}
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 9087ab2..fe82aa1 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -48,7 +48,7 @@ static cl::opt<int>
 InlineLimit("inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
         cl::desc("Control the amount of inlining to perform (default = 225)"));
 
-static cl::opt<int>
+cl::opt<int>
 HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325),
               cl::desc("Threshold for inlining functions with inline hint"));
 
@@ -463,7 +463,10 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
   // index into the InlineHistory vector.
   SmallVector<std::pair<Function*, int>, 8> InlineHistory;
 
-  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+  if (PreferredCS.getInstruction() != NULL)
+    CallSites.push_back(std::make_pair(PreferredCS, -1));
+  else for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E;
+                                                                          ++I) {
     Function *F = (*I)->getFunction();
     if (!F) continue;
     
@@ -502,6 +505,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
   
   InlinedArrayAllocasTy InlinedArrayAllocas;
   InlineFunctionInfo InlineInfo(&CG, DL);
+  InlinedCalls.clear();
   
   // Now that we have all of the call sites, loop over them and inline them if
   // it looks profitable to do so.
@@ -575,7 +579,10 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
 
         // If inlining this function gave us any new call sites, throw them
         // onto our worklist to process.  They are useful inline candidates.
-        if (!InlineInfo.InlinedCalls.empty()) {
+        if (PreferredCS.getInstruction() != NULL)
+          InlinedCalls.insert(InlinedCalls.end(),
+                InlineInfo.InlinedCalls.begin(), InlineInfo.InlinedCalls.end());
+        else if (!InlineInfo.InlinedCalls.empty()) {
           // Create a new inline history entry for this, so that we remember
           // that these new callsites came about due to inlining Callee.
           int NewHistoryID = InlineHistory.size();
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 98477b5..92fc1ab 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -57,6 +57,11 @@ static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false),
                                     cl::Hidden,
                                     cl::desc("Run the load combining pass"));
 
+extern cl::opt<bool> EnableGreedyInliner;
+
+cl::opt<bool> LTOFileEmit("lto-file-emit", cl::init(false), cl::ZeroOrMore,
+       cl::desc("Indicate the phase where LTO is emitting for one source"));
+
 PassManagerBuilder::PassManagerBuilder() {
     OptLevel = 2;
     SizeLevel = 0;
@@ -170,12 +175,21 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   // Start of CallGraph SCC passes.
   if (!DisableUnitAtATime)
     MPM.add(createPruneEHPass());             // Remove dead EH info
+
   if (Inliner) {
+    // FIXME: GreedyInliner is a module pass, so the function attrs pass has
+    // to be added explicitly before the inliner pass. Due to a bug in the
+    // function attrs pass, it cannot run again after inlining. This should be
+    // fixed in the future.
+    if (EnableGreedyInliner)
+      MPM.add(createFunctionAttrsPass());     // Set readonly/readnone attrs
     MPM.add(Inliner);
     Inliner = nullptr;
   }
-  if (!DisableUnitAtATime)
-    MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
+
+  if (!DisableUnitAtATime && !EnableGreedyInliner)
+    MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs
+
   if (OptLevel > 2)
     MPM.add(createArgumentPromotionPass());   // Scalarize uninlined fn args
 
@@ -319,8 +333,12 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
   addExtensionsToPM(EP_Peephole, PM);
 
   // Inline small functions
-  if (RunInliner)
-    PM.add(createFunctionInliningPass());
+  if (RunInliner) {
+    if (EnableGreedyInliner)
+      PM.add(createGreedyInliningPass(OptLevel));
+    else
+      PM.add(createFunctionInliningPass());
+  }
 
   PM.add(createPruneEHPass());   // Remove dead EH info.
 
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 6ba6340..36568a3 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -104,6 +104,8 @@ StripDebug("strip-debug",
 static cl::opt<bool>
 DisableInline("disable-inlining", cl::desc("Do not run the inliner pass"));
 
+extern cl::opt<bool> EnableGreedyInliner;
+
 static cl::opt<bool>
 DisableOptimizations("disable-opt",
                      cl::desc("Do not run any optimization passes"));
@@ -186,7 +188,6 @@ DefaultDataLayout("default-data-layout",
           cl::value_desc("layout-string"), cl::init(""));
 
 
-
 static inline void addPass(PassManagerBase &PM, Pass *P) {
   // Add the pass to the pass manager...
   PM.add(P);
@@ -215,7 +216,17 @@ static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM,
   if (DisableInline) {
     // No inlining pass
   } else if (OptLevel > 1) {
-    Builder.Inliner = createFunctionInliningPass(OptLevel, SizeLevel);
+    if (EnableGreedyInliner) {
+      // Derive the threshold from the levels (-O3 overrides the size levels)
+      // and pass it on; a hard-coded 275 used to be passed regardless.
+      unsigned Threshold = OptLevel > 2 ? 275 : 225;
+      if (OptLevel <= 2 && SizeLevel == 1)
+        Threshold = 75;
+      else if (OptLevel <= 2 && SizeLevel == 2)
+        Threshold = 25;
+      Builder.Inliner = createGreedyInliningPass(Threshold, OptLevel);
+    } else
+      Builder.Inliner = createFunctionInliningPass(OptLevel, SizeLevel);
   } else {
     Builder.Inliner = createAlwaysInlinerPass();
   }
@@ -252,8 +263,13 @@ static void AddStandardCompilePasses(PassManagerBase &PM) {
 
   // -std-compile-opts adds the same module passes as -O3.
   PassManagerBuilder Builder;
-  if (!DisableInline)
-    Builder.Inliner = createFunctionInliningPass();
+  if (!DisableInline) {
+    if (EnableGreedyInliner)
+      Builder.Inliner = createGreedyInliningPass();
+    else
+      Builder.Inliner = createFunctionInliningPass();
+  }
+
   Builder.OptLevel = 3;
   Builder.populateModulePassManager(PM);
 }
-- 
1.8.2.1

