Good job, Ruiling! In general, this patch looks good to me. And as we just discussed offline, after you submit a fix for the SLM value allocation bug, I will do a full test on this patch. If everything is OK, I will push it then. Thanks.
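By the way, for anyone following along on the list: the kind of access pattern this pass targets looks roughly like the kernel below. The kernel and all names in it are made up for illustration (not taken from the patch or from any test); whether the accesses actually get merged depends on ScalarEvolution proving that they are adjacent DWORDs in the same address space.

  __kernel void copy4(__global const float *src, __global float *dst)
  {
    size_t base = get_global_id(0) * 4;
    /* Four simple, consecutive DWORD (float) loads: candidates for
     * merging into a single vector load by the new pass. */
    float a = src[base + 0];
    float b = src[base + 1];
    float c = src[base + 2];
    float d = src[base + 3];
    /* Four consecutive stores: likewise candidates for a single
     * vector store. */
    dst[base + 0] = a;
    dst[base + 1] = b;
    dst[base + 2] = c;
    dst[base + 3] = d;
  }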
On Thu, May 08, 2014 at 10:18:22AM +0800, Ruiling Song wrote:
> Gen supports at most 4 DWORD reads/writes in one single instruction.
> So we merge successive reads/writes for fewer instructions and better performance.
> This improves the LuxMark medium scene by about 10%.
>
> Signed-off-by: Ruiling Song <[email protected]>
> ---
>  backend/src/CMakeLists.txt                       |    1 +
>  backend/src/llvm/llvm_gen_backend.hpp            |    3 +
>  backend/src/llvm/llvm_loadstore_optimization.cpp |  269 ++++++++++++++++++++++
>  backend/src/llvm/llvm_to_gen.cpp                 |    3 +-
>  4 files changed, 275 insertions(+), 1 deletion(-)
>  create mode 100644 backend/src/llvm/llvm_loadstore_optimization.cpp
>
> diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
> index 2d59644..3bb31e5 100644
> --- a/backend/src/CMakeLists.txt
> +++ b/backend/src/CMakeLists.txt
> @@ -146,6 +146,7 @@ else (GBE_USE_BLOB)
>      llvm/llvm_intrinsic_lowering.cpp
>      llvm/llvm_barrier_nodup.cpp
>      llvm/llvm_to_gen.cpp
> +    llvm/llvm_loadstore_optimization.cpp
>      llvm/llvm_gen_backend.hpp
>      llvm/llvm_gen_ocl_function.hxx
>      llvm/llvm_to_gen.hpp
> diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
> index 56dd27f..26323a3 100644
> --- a/backend/src/llvm/llvm_gen_backend.hpp
> +++ b/backend/src/llvm/llvm_gen_backend.hpp
> @@ -84,6 +84,9 @@ namespace gbe
>    /*! Remove the GEP instructions */
>    llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
>
> +  /*! Merge load/store if possible */
> +  llvm::BasicBlockPass *createLoadStoreOptimizationPass();
> +
>    /*! Scalarize all vector op instructions */
>    llvm::FunctionPass* createScalarizePass();
>    /*! Remove/add NoDuplicate function attribute for barrier functions. */
> diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
> new file mode 100644
> index 0000000..a597927
> --- /dev/null
> +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
> @@ -0,0 +1,269 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see <http://www.gnu.org/licenses/>.
> + *
> + * Author: Ruiling, Song <[email protected]>
> + *
> + * The idea is that: as Gen supports at most 4 successive DWORD loads/stores,
> + * merging successive loads/stores that are compatible is beneficial.
> + * The method of checking whether two loads/stores are compatible is borrowed
> + * from the Vectorize passes in LLVM.
> + */
> +
> +#include "llvm/IR/Instructions.h"
> +#include "llvm/Pass.h"
> +#include "llvm/PassManager.h"
> +
> +#include "llvm/Config/config.h"
> +#include "llvm/ADT/DenseMap.h"
> +#include "llvm/ADT/PostOrderIterator.h"
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
> +#include "llvm/Function.h"
> +#include "llvm/InstrTypes.h"
> +#include "llvm/Instructions.h"
> +#include "llvm/IntrinsicInst.h"
> +#include "llvm/Module.h"
> +#else
> +#include "llvm/IR/Function.h"
> +#include "llvm/IR/InstrTypes.h"
> +#include "llvm/IR/Instructions.h"
> +#include "llvm/IR/IntrinsicInst.h"
> +#include "llvm/IR/Module.h"
> +#endif /* LLVM_VERSION_MINOR <= 2 */
> +#include "llvm/Pass.h"
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
> +#include "llvm/Support/IRBuilder.h"
> +#elif LLVM_VERSION_MINOR == 2
> +#include "llvm/IRBuilder.h"
> +#else
> +#include "llvm/IR/IRBuilder.h"
> +#endif /* LLVM_VERSION_MINOR <= 1 */
> +#include "llvm/Support/CallSite.h"
> +#include "llvm/Support/CFG.h"
> +#include "llvm/Support/raw_ostream.h"
> +#include "llvm/Analysis/ScalarEvolution.h"
> +#include "llvm/Analysis/ScalarEvolutionExpressions.h"
> +
> +using namespace llvm;
> +namespace gbe {
> +  class GenLoadStoreOptimization : public BasicBlockPass {
> +
> +  public:
> +    static char ID;
> +    ScalarEvolution *SE;
> +    DataLayout *TD;
> +    GenLoadStoreOptimization() : BasicBlockPass(ID) {}
> +
> +    void getAnalysisUsage(AnalysisUsage &AU) const {
> +      AU.addRequired<ScalarEvolution>();
> +      AU.addPreserved<ScalarEvolution>();
> +      AU.setPreservesCFG();
> +    }
> +
> +    virtual bool runOnBasicBlock(BasicBlock &BB) {
> +      SE = &getAnalysis<ScalarEvolution>();
> +      TD = getAnalysisIfAvailable<DataLayout>();
> +      return optimizeLoadStore(BB);
> +    }
> +    Type     *getValueType(Value *insn);
> +    Value    *getPointerOperand(Value *I);
> +    unsigned  getAddressSpace(Value *I);
> +    bool      isSimpleLoadStore(Value *I);
> +    bool      optimizeLoadStore(BasicBlock &BB);
> +
> +    bool      isLoadStoreCompatible(Value *A, Value *B);
> +    void      mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> +    void      mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> +    BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
> +                                               SmallVector<Instruction*, 4> &merged,
> +                                               BasicBlock::iterator &start,
> +                                               unsigned maxLimit,
> +                                               bool isLoad);
> +
> +    virtual const char *getPassName() const {
> +      return "Merge compatible Load/stores for Gen";
> +    }
> +  };
> +
> +  char GenLoadStoreOptimization::ID = 0;
> +
> +  Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
> +    if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
> +    if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
> +    return NULL;
> +  }
> +  unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
> +    if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
> +    if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
> +    return -1;
> +  }
> +  bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
> +    if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->isSimple();
> +    if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->isSimple();
> +    return false;
> +  }
> +  Type *GenLoadStoreOptimization::getValueType(Value *insn) {
> +    if(LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
> +    if(StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
> +
> +    return NULL;
> +  }
> +
> +  bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
> +    Value *ptrA = getPointerOperand(A);
> +    Value *ptrB = getPointerOperand(B);
> +    unsigned ASA = getAddressSpace(A);
> +    unsigned ASB = getAddressSpace(B);
> +
> +    // Check that the address spaces match and that the pointers are valid.
> +    if (!ptrA || !ptrB || (ASA != ASB)) return false;
> +
> +    if(!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
> +    // Check that A and B are of the same type.
> +    if (ptrA->getType() != ptrB->getType()) return false;
> +
> +    // Calculate the distance.
> +    const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
> +    const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
> +    const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
> +    const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
> +
> +    // Non-constant distance.
> +    if (!constOffSCEV) return false;
> +
> +    int64_t offset = constOffSCEV->getValue()->getSExtValue();
> +    Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
> +    // The instructions are consecutive if the size of the first load/store is
> +    // the same as the offset.
> +    int64_t sz = TD->getTypeStoreSize(Ty);
> +    return ((-offset) == sz);
> +  }
> +
> +  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> +    IRBuilder<> Builder(&BB);
> +
> +    unsigned size = merged.size();
> +    SmallVector<Value *, 4> values;
> +    for(unsigned i = 0; i < size; i++) {
> +      values.push_back(merged[i]);
> +    }
> +    LoadInst *ld = cast<LoadInst>(merged[0]);
> +    unsigned align = ld->getAlignment();
> +    unsigned addrSpace = ld->getPointerAddressSpace();
> +    // insert before first load
> +    Builder.SetInsertPoint(ld);
> +    VectorType *vecTy = VectorType::get(ld->getType(), size);
> +    Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
> +                                          PointerType::get(vecTy, addrSpace));
> +    LoadInst *vecValue = Builder.CreateLoad(vecPtr);
> +    vecValue->setAlignment(align);
> +
> +    for (unsigned i = 0; i < size; ++i) {
> +      Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
> +      values[i]->replaceAllUsesWith(S);
> +    }
> +  }
> +
> +  BasicBlock::iterator
> +  GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
> +                                                  SmallVector<Instruction*, 4> &merged,
> +                                                  BasicBlock::iterator &start,
> +                                                  unsigned maxLimit,
> +                                                  bool isLoad) {
> +
> +    BasicBlock::iterator stepForward = start;
> +    if(!isSimpleLoadStore(start)) return stepForward;
> +
> +    merged.push_back(start);
> +
> +    BasicBlock::iterator E = BB.end();
> +    BasicBlock::iterator J = ++start;
> +
> +    for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
> +      if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
> +        if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
> +          merged.push_back(J);
> +          stepForward = ++J;
> +        }
> +      } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
> +        // simple stop to keep read/write order
> +        break;
> +      }
> +
> +      if(merged.size() >= 4) break;
> +    }
> +    return stepForward;
> +  }
> +
> +  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> +    IRBuilder<> Builder(&BB);
> +
> +    unsigned size = merged.size();
> +    SmallVector<Value *, 4> values;
> +    for(unsigned i = 0; i < size; i++) {
> +      values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
> +    }
> +    StoreInst *st = cast<StoreInst>(merged[0]);
> +    unsigned addrSpace = st->getPointerAddressSpace();
> +
> +    unsigned align = st->getAlignment();
> +    // insert before the last store
> +    Builder.SetInsertPoint(merged[size-1]);
> +
> +    Type *dataTy = st->getValueOperand()->getType();
> +    VectorType *vecTy = VectorType::get(dataTy, size);
> +    Value *parent = UndefValue::get(vecTy);
> +    for(unsigned i = 0; i < size; i++) {
> +      parent = Builder.CreateInsertElement(parent, values[i], ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
> +    }
> +
> +    Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
> +    StoreInst *newST = Builder.CreateStore(parent, newPtr);
> +    newST->setAlignment(align);
> +  }
> +
> +  bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
> +    bool changed = false;
> +    SmallVector<Instruction*, 4> merged;
> +    for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ++BBI) {
> +      if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
> +        bool isLoad = isa<LoadInst>(*BBI) ? true : false;
> +        Type *ty = getValueType(BBI);
> +        if(ty->isVectorTy()) continue;
> +        // we only support DWORD data type merge
> +        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
> +        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
> +        if(merged.size() > 1) {
> +          if(isLoad)
> +            mergeLoad(BB, merged);
> +          else
> +            mergeStore(BB, merged);
> +          // remove merged insn
> +          int size = merged.size();
> +          for(int i = 0; i < size; i++)
> +            merged[i]->eraseFromParent();
> +          changed = true;
> +        }
> +        merged.clear();
> +      }
> +    }
> +    return changed;
> +  }
> +
> +  BasicBlockPass *createLoadStoreOptimizationPass() {
> +    return new GenLoadStoreOptimization();
> +  }
> +};
> +
> diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
> index 37a5b2b..9282b3f 100644
> --- a/backend/src/llvm/llvm_to_gen.cpp
> +++ b/backend/src/llvm/llvm_to_gen.cpp
> @@ -187,7 +187,7 @@ namespace gbe
>      runModulePass(mod, libraryInfo, optLevel);
>
>      llvm::PassManager passes;
> -
> +    passes.add(new DataLayout(&mod));
>      // Print the code before further optimizations
>      if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
>  #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
> @@ -198,6 +198,7 @@ namespace gbe
>      passes.add(createIntrinsicLoweringPass());
>      passes.add(createFunctionInliningPass(200000));
>      passes.add(createScalarReplAggregatesPass()); // Break up allocas
> +    passes.add(createLoadStoreOptimizationPass());
>      passes.add(createRemoveGEPPass(unit));
>      passes.add(createConstantPropagationPass());
>      passes.add(createLowerSwitchPass());
> --
> 1.7.10.4
>
> _______________________________________________
> Beignet mailing list
> [email protected]
> http://lists.freedesktop.org/mailman/listinfo/beignet

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet
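P.S. One more made-up fragment for the archives: the scan in findConsecutiveAccess deliberately stops at the first access of the opposite kind, because without alias analysis the pass must assume an intervening access may touch the same memory. So in a fragment like the one below, the two loads from buf stay scalar:

  __kernel void swap2(__global float *buf, __global float *out)
  {
    size_t i = get_global_id(0) * 2;
    float a = buf[i + 0];
    /* This store may alias buf[i + 1]; the scan breaks here
     * ("simple stop to keep read/write order"), so the two
     * loads are not merged. */
    out[i] = a;
    float b = buf[i + 1];
    buf[i + 0] = b;
    buf[i + 1] = a;
  }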
