Scratch memory is a limited resource in HW, and different registers have the opportunity to share the same scratch memory. So I implement a specially designed allocator to reduce scratch memory usage. It is assumed that scratch memory is always allocated in multiples of 32 bytes.
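To illustrate the intended behavior, here is a minimal usage sketch of the
new interface (the concrete offsets assume fixedSize == 32 as in this patch):

    ScratchAllocator sa;
    int a = sa.allocate(64);  // no free block yet: grow the pool, offset 0
    int b = sa.allocate(32);  // grow again, offset 64
    sa.deallocate(a);         // [0, 64) goes back onto the free list
    int c = sa.allocate(32);  // reuses offset 0 and splits the free block,
                              // leaving [32, 64) on the free list

After this sequence getMaxSize() is still 96, while the old bump-pointer
scheme would have placed c at offset 96 and needed 128 bytes.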
Signed-off-by: Ruiling Song <[email protected]>
---
 backend/src/backend/context.cpp            | 149 +++++++++++++++++++++++++++-
 backend/src/backend/context.hpp            |  28 +++++-
 backend/src/backend/gen_reg_allocation.cpp |  50 ++++++++--
 3 files changed, 212 insertions(+), 15 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 2125bd1..24f4f6f 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -301,6 +301,145 @@ namespace gbe
     return i;
   }

+  /*
+   * As scratch memory is allocated in multiples of fixedSize(32), we design this
+   * simple allocator to make allocation and free efficient. A ScratchBlock is a
+   * multiple of fixedSize with one head, and possibly one tail and internal
+   * elements; each element is defined as [size, used, prev, next].
+   * The block header's size records the real block size; the other elements in
+   * the block have size 0. When an element is a block header, its [prev/next]
+   * fields are used for the free block list. The tail element's prev points to
+   * the block header. When a block is freed, its left and right neighbors are
+   * checked for a possible merge.
+   */
+  // XXX TODO: as we optimize scratch memory usage using register intervals,
+  // we need to add some dependencies in the post_reg_alloc scheduler, so that
+  // accesses to reused scratch memory stay in order.
+  int ScratchAllocator::allocate(int bytes) {
+    GBE_ASSERT(bytes % fixedSize == 0);
+    int size = bytes / fixedSize;
+    int target = free;
+    while(target != -1 && blocks[target].size < size) {
+      target = blocks[target].next;
+    }
+
+    if(target == -1) {
+      // no suitable block found, simply allocate enough
+      int ret = maxOffset * fixedSize;
+      blocks.push_back(ScratchBlock(size, 1, -1, -1));
+      for(int i = 0; i < size-1; i++)
+        blocks.push_back(ScratchBlock(0, 1, maxOffset, -1));
+      maxOffset = blocks.size();
+      return ret;
+    } else {
+      // found one block large enough
+      int old_size = blocks[target].size;
+      for(int i = 0; i < size; i++) {
+        blocks[target + i].used = 1;
+        blocks[target + i].size = 0;
+      }
+      blocks[target].size = size;
+
+      if(old_size > size) {
+        // split the larger block
+        if(size > 1)
+          blocks[target + size - 1].prev = target;
+
+        // setup new free block
+        int new_block = target + size;
+        int new_size = old_size - size;
+        blocks[new_block].size = new_size;
+        blocks[new_block].prev = blocks[target].prev;
+        blocks[new_block].next = blocks[target].next;
+        if(new_size > 1)
+          blocks[new_block + new_size - 1].prev = new_block;
+
+        // setup free list
+        if(blocks[target].prev != -1)
+          blocks[blocks[target].prev].next = new_block;
+        else {
+          GBE_ASSERT(free == target);
+          free = new_block;
+        }
+        if(blocks[target].next != -1)
+          blocks[blocks[target].next].prev = new_block;
+      } else {
+        // the block is exactly the requested size
+        if(blocks[target].prev == -1) {
+          GBE_ASSERT(free == target);
+          free = blocks[target].next;
+        } else {
+          blocks[blocks[target].prev].next = blocks[target].next;
+        }
+
+        if(blocks[target].next != -1)
+          blocks[blocks[target].next].prev = blocks[target].prev;
+      }
+      blocks[target].prev = blocks[target].next = -1;
+
+      return target * fixedSize;
+    }
+  }
+
+  void ScratchAllocator::dumpAllBlocks() {
+    std::cout << "---All Scratch Blocks Dumped (with free = " << free << ")---" << std::endl;
+    for(size_t i = 0; i < blocks.size(); i++) {
+      std::cout << "[" << i << ", sz(" << (int)blocks[i].size << "), u(" << (int)blocks[i].used
+                << "), p(" << blocks[i].prev << "), n(" << blocks[i].next << ")]" << std::endl;
+    }
+  }
+
+  void ScratchAllocator::deallocate(int offset) {
+    offset /= fixedSize;
+    int size = blocks[offset].size;
+    for(int i = 0; i < size; i++) {
+      blocks[offset+i].used = 0;
+      blocks[offset+i].next = -1;
+    }
+
+    int head = offset;
+    // can the left neighbor be merged?
+    if(offset > 0 && blocks[offset-1].used == 0) {
+      int lblock_head = offset - 1;
+      if(blocks[offset-1].size == 0) // block tail
+        lblock_head = blocks[offset-1].prev;
+      blocks[lblock_head].size += size;
+      blocks[offset+size-1].prev = lblock_head; // point to the new block head
+      blocks[offset].size = 0; // no longer a head
+      GBE_ASSERT(blocks[offset+size-1].size == 0);
+      head = lblock_head;
+    }
+
+    int rblock_head = offset+size;
+    // can the right neighbor be merged?
+    if(rblock_head < maxOffset && blocks[rblock_head].used == 0) {
+      int rblock_size = blocks[rblock_head].size;
+      blocks[head].size += rblock_size;
+      GBE_ASSERT(rblock_size != 0);
+      blocks[rblock_head].size = 0; // no longer a head
+
+      // let rblock's prev directly point to rblock's next
+      if(blocks[rblock_head].prev != -1)
+        blocks[blocks[rblock_head].prev].next = blocks[rblock_head].next;
+      else
+        free = blocks[rblock_head].next;
+
+      if(blocks[rblock_head].next != -1)
+        blocks[blocks[rblock_head].next].prev = blocks[rblock_head].prev;
+
+      blocks[rblock_head + rblock_size - 1].prev = head; // point to the new block head
+      blocks[rblock_head + rblock_size - 1].next = -1;
+    }
+
+    // insert the block at the front of the free list
+    if(blocks[head].prev == -1) {
+      // 'head' may be 'free' itself or the newly deallocated block
+      int old_free = free;
+
+      if(old_free != -1 && old_free != head) {
+        blocks[old_free].prev = head;
+        blocks[head].next = old_free;
+      }
+      free = head;
+    }
+  }
+
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
@@ -317,7 +456,6 @@ namespace gbe
       this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
     else
       this->simdWidth = fn.getSimdWidth();
-    this->scratchOffset = 0;
   }

   Context::~Context(void) {
@@ -340,7 +478,7 @@ namespace gbe
       this->kernel = NULL;
     }
     if(this->kernel != NULL) {
-      this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+      this->kernel->scratchSize = alignScratchSize(scratchAllocator.getMaxSize());
       this->kernel->ctx = this;
     }
     return this->kernel;
@@ -378,9 +516,10 @@ namespace gbe
   }

   uint32_t Context::allocateScratchMem(uint32_t size) {
-    uint32_t offset = scratchOffset;
-    scratchOffset += size;
-    return offset;
+    return scratchAllocator.allocate(size);
+  }
+  void Context::deallocateScratchMem(uint32_t offset) {
+    scratchAllocator.deallocate(offset);
   }

   void Context::buildStack(void) {
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 000612e..a63ddb3 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -43,6 +43,30 @@ namespace gbe
   class Kernel;                  // context creates Kernel
   class RegisterFilePartitioner; // Partition register file for reg allocation

+  struct ScratchBlock {
+    ScratchBlock(uint8_t _size, uint8_t _used, int _prev, int _next)
+      : size(_size), used(_used), prev(_prev), next(_next)
+    {}
+    uint8_t size; //!< size of the block if this is the head, otherwise 0
+    uint8_t used; //!< used: 1, free: 0
+    int prev;     //!< for a block head, the previous free block; otherwise, points to the block head
+    int next;     //!< used: -1, free: next free block
+  };
+
+  class ScratchAllocator {
+  public:
+    ScratchAllocator() : free(-1), maxOffset(0) {}
+    ~ScratchAllocator() { blocks.clear(); }
+    int allocate(int size);
+    void deallocate(int offset);
+    int getMaxSize() { return maxOffset * fixedSize; }
+    void dumpAllBlocks();
+  private:
+    std::vector<ScratchBlock> blocks;
+    int free;
+    int maxOffset;
+    static const int fixedSize = 32;
+  };
+
   /*! Context is the helper structure to build the Gen ISA or simulation code
    *  from GenIR */
@@ -95,6 +119,8 @@ namespace gbe
     uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
     /*! allocate size scratch memory and return start address */
     uint32_t allocateScratchMem(uint32_t size);
+    /*! deallocate scratch memory at offset */
+    void deallocateScratchMem(uint32_t offset);
     /*! Preallocated curbe register set including special registers. */
     map<ir::Register, uint32_t> curbeRegs;
   protected:
@@ -133,7 +159,7 @@ namespace gbe
     set<ir::LabelIndex> usedLabels;    //!< Set of all used labels
     JIPMap JIPs;                       //!< Where to jump all labels/branches
     uint32_t simdWidth;                //!< Number of lanes per HW threads
-    uint32_t scratchOffset;            //!< scratch slot for next scratch memory request
+    ScratchAllocator scratchAllocator; //!< scratch memory allocator
     GBE_CLASS(Context);                //!< Use custom allocators
   };
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 726b78c..2bab785 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -177,7 +177,7 @@ namespace gbe
     INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
     INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
     INLINE bool vectorCanSpill(SelectionVector *vector);
-
+    INLINE void allocateScratchForSpilled();
     /*! Use custom allocator */
     GBE_CLASS(Opaque);
   };
@@ -576,6 +576,8 @@ namespace gbe
     }
     if (!spilledRegs.empty()) {
       GBE_ASSERT(reservedReg != 0);
+      allocateScratchForSpilled();
+
       bool success = selection.spillRegs(spilledRegs, reservedReg);
       if (!success) {
         std::cerr << "Fail to spill registers." << std::endl;
@@ -585,6 +587,43 @@ namespace gbe
     return true;
   }

+  INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+  {
+    const uint32_t regNum = spilledRegs.size();
+    this->starting.resize(regNum);
+    this->ending.resize(regNum);
+    uint32_t regID = 0;
+    for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
+      this->starting[regID] = this->ending[regID] = &intervals[it->first];
+      regID++;
+    }
+    std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+    std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+    int toExpire = 0;
+    for(uint32_t i = 0; i < regNum; i++) {
+      const GenRegInterval * cur = starting[i];
+      const GenRegInterval * exp = ending[toExpire];
+      // expire the earliest-ending interval if it ends before we start,
+      // returning its scratch slot to the allocator for reuse
+      if(exp->maxID <= cur->minID) {
+        auto it = spilledRegs.find(exp->reg);
+        GBE_ASSERT(it != spilledRegs.end());
+        if(it->second.addr != -1) {
+          ctx.deallocateScratchMem(it->second.addr);
+        }
+        toExpire++;
+      }
+      auto it = spilledRegs.find(cur->reg);
+      GBE_ASSERT(it != spilledRegs.end());
+      if(cur->minID == cur->maxID) {
+        // a temporary register only lives in one instruction, no scratch needed
+        it->second.addr = -1;
+        continue;
+      }
+
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
+      it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
+                                               * ctx.getSimdWidth());
+    }
+  }
+
   INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
   {
     auto it = RA.find(reg);
@@ -636,14 +675,7 @@ namespace gbe
       return false;
     SpillRegTag spillTag;
     spillTag.isTmpReg = interval.maxID == interval.minID;
-    if (!spillTag.isTmpReg) {
-      // FIXME, we can optimize scratch allocation according to
-      // the interval information.
-      ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
-      spillTag.addr = ctx.allocateScratchMem(getFamilySize(family)
-                                             * ctx.getSimdWidth());
-    } else
-      spillTag.addr = -1;
+
     if (isAllocated) {
       // If this register is allocated, we need to expire it and erase it
      // from the RA map.
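To make the interval-based reuse concrete, here is a small hand trace of
allocateScratchForSpilled() (the register names and interval IDs are made up
for illustration; SIMD16 with a DWORD-family register is assumed, so each
spill slot is 64 bytes):

    spilled intervals, sorted by start:
      %r10 [minID  5, maxID 18]  ->  allocateScratchMem(64) returns offset 0
      %r11 [minID 20, maxID 33]  ->  %r10 has expired (18 <= 20), so its slot
                                     is deallocated first and %r11's request
                                     reuses offset 0

Total scratch stays at 64 bytes where the old per-spill allocation needed
128. A register with minID == maxID is a short-lived temporary: it gets
addr = -1 and no scratch at all.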
--
1.7.9.5

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet
