initialize the data inside kernel with packed integer vector Signed-off-by: Guo Yejun <yejun....@intel.com> --- backend/src/backend/context.cpp | 66 ++---------------------------- backend/src/backend/context.hpp | 61 ++++++++++++++++++++++++++- backend/src/backend/gen75_context.cpp | 4 +- backend/src/backend/gen_context.cpp | 33 ++++++++++++++- backend/src/backend/gen_context.hpp | 2 + backend/src/backend/gen_reg_allocation.cpp | 26 +++++++----- backend/src/backend/program.h | 1 - backend/src/backend/program.hpp | 2 +- src/cl_command_queue_gen7.c | 9 ---- 9 files changed, 116 insertions(+), 88 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index b8dfa8c..16b5961 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -35,66 +35,6 @@ namespace gbe { - class SimpleAllocator - { - public: - SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail); - ~SimpleAllocator(void); - - /*! Allocate some memory from the pool. - */ - int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false); - - /*! Free the given register file piece */ - void deallocate(int16_t offset); - - /*! Spilt a block into 2 blocks */ - void splitBlock(int16_t offset, int16_t subOffset); - - protected: - /*! Double chained list of free spaces */ - struct Block { - Block(int16_t offset, int16_t size) : - prev(NULL), next(NULL), offset(offset), size(size) {} - Block *prev, *next; //!< Previous and next free blocks - int16_t offset; //!< Where the free block starts - int16_t size; //!< Size of the free block - }; - - /*! Try to coalesce two blocks (left and right). They must be in that order. - * If the colascing was done, the left block is deleted - */ - void coalesce(Block *left, Block *right); - /*! the maximum offset */ - int16_t maxOffset; - /*! whether trigger an assertion on allocation failure */ - bool assertFail; - /*! Head and tail of the free list */ - Block *head; - Block *tail; - /*! Handle free list element allocation */ - DECL_POOL(Block, blockPool); - /*! Track allocated memory blocks <offset, size> */ - map<int16_t, int16_t> allocatedBlocks; - /*! Use custom allocators */ - GBE_CLASS(SimpleAllocator); - }; - - /*! Structure that keeps track of allocation in the register file. This is - * actually needed by Context (and not only by GenContext) because both - * simulator and hardware have to deal with constant pushing which uses the - * register file - * - * Since Gen is pretty flexible, we just reuse the Simpleallocator - */ - - class RegisterAllocator: public SimpleAllocator { - public: - RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {} - - GBE_CLASS(RegisterAllocator); - }; - /*! * an allocator for scratch memory allocation. Scratch memory are used for register spilling. * You can query how much scratch memory needed through getMaxScatchMemUsed(). @@ -396,10 +336,10 @@ namespace gbe void Context::buildStack(void) { const auto &stackUse = dag->getUse(ir::ocl::stackptr); - if (stackUse.size() == 0) // no stack is used if stackptr is unused + if (stackUse.size() == 0) { // no stack is used if stackptr is unused + this->kernel->stackSize = 0; return; - // Be sure that the stack pointer is set - // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0); + } uint32_t stackSize = 128; while (stackSize < fn.getStackSize()) { stackSize *= 3; diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp index faa7c8a..48b46ed 100644 --- a/backend/src/backend/context.hpp +++ b/backend/src/backend/context.hpp @@ -42,9 +42,68 @@ namespace ir { namespace gbe { class Kernel; // context creates Kernel - class RegisterAllocator; // allocator for physical register allocation class ScratchAllocator; // allocator for scratch memory allocation + class SimpleAllocator + { + public: + SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail); + ~SimpleAllocator(void); + + /*! Allocate some memory from the pool. + */ + int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false); + + /*! Free the given register file piece */ + void deallocate(int16_t offset); + + /*! Spilt a block into 2 blocks */ + void splitBlock(int16_t offset, int16_t subOffset); + + protected: + /*! Double chained list of free spaces */ + struct Block { + Block(int16_t offset, int16_t size) : + prev(NULL), next(NULL), offset(offset), size(size) {} + Block *prev, *next; //!< Previous and next free blocks + int16_t offset; //!< Where the free block starts + int16_t size; //!< Size of the free block + }; + + /*! Try to coalesce two blocks (left and right). They must be in that order. + * If the colascing was done, the left block is deleted + */ + void coalesce(Block *left, Block *right); + /*! the maximum offset */ + int16_t maxOffset; + /*! whether trigger an assertion on allocation failure */ + bool assertFail; + /*! Head and tail of the free list */ + Block *head; + Block *tail; + /*! Handle free list element allocation */ + DECL_POOL(Block, blockPool); + /*! Track allocated memory blocks <offset, size> */ + map<int16_t, int16_t> allocatedBlocks; + /*! Use custom allocators */ + GBE_CLASS(SimpleAllocator); + }; + + /*! Structure that keeps track of allocation in the register file. This is + * actually needed by Context (and not only by GenContext) because both + * simulator and hardware have to deal with constant pushing which uses the + * register file + * + * Since Gen is pretty flexible, we just reuse the Simpleallocator + */ + + class RegisterAllocator: public SimpleAllocator { + public: + RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {} + + GBE_CLASS(RegisterAllocator); + }; + /*! Context is the helper structure to build the Gen ISA or simulation code * from GenIR */ diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp index b9dfb18..7d407c3 100644 --- a/backend/src/backend/gen75_context.cpp +++ b/backend/src/backend/gen75_context.cpp @@ -67,7 +67,7 @@ namespace gbe using namespace ir; // Only emit stack pointer computation if we use a stack - if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0) + if (kernel->getStackSize() == 0) return; // Check that everything is consistent in the kernel code @@ -80,6 +80,8 @@ namespace gbe GenRegister::ud16grf(ir::ocl::stackptr); const GenRegister stackptr = ra->genReg(selStatckPtr); + loadLaneID(stackptr); + // We compute the per-lane stack pointer here // private address start from zero p->push(); diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 0c301dd..25fdf08 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -176,11 +176,39 @@ namespace gbe p->pop(); } + void GenContext::loadLaneID(GenRegister dst) { + const GenRegister laneID = GenRegister::immv(0x76543210); + GenRegister dst_; + if (dst.type == GEN_TYPE_UW) + dst_ = dst; + else + dst_ = GenRegister::uw16grf(126,0); + + p->push(); + uint32_t execWidth = p->curr.execWidth; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + if (execWidth == 8) + p->MOV(dst_, laneID); + else { + p->curr.execWidth = 8; + p->MOV(dst_, laneID); + //Packed Unsigned Half-Byte Integer Vector does not work + //have to mock by adding 8 to the singed vector + const GenRegister eight = GenRegister::immuw(8); + p->ADD(GenRegister::offset(dst_, 0, 16), dst_, eight); + p->curr.execWidth = 16; + } + if (dst.type != GEN_TYPE_UW) + p->MOV(dst, dst_); + p->pop(); + } + void GenContext::emitStackPointer(void) { using namespace ir; // Only emit stack pointer computation if we use a stack - if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0) + if (kernel->getStackSize() == 0) return; // Check that everything is consistent in the kernel code @@ -193,6 +221,8 @@ namespace gbe GenRegister::ud16grf(ir::ocl::stackptr); const GenRegister stackptr = ra->genReg(selStatckPtr); + loadLaneID(stackptr); + // We compute the per-lane stack pointer here // threadId * perThreadSize + laneId*perLaneSize // let private address start from zero @@ -2254,7 +2284,6 @@ namespace gbe INSERT_REG(numgroup0, GROUP_NUM_X) INSERT_REG(numgroup1, GROUP_NUM_Y) INSERT_REG(numgroup2, GROUP_NUM_Z) - INSERT_REG(stackptr, STACK_POINTER) INSERT_REG(printfbptr, PRINTF_BUF_POINTER) INSERT_REG(printfiptr, PRINTF_INDEX_POINTER) do {} while(0); diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 8ef725f..34f9293 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -107,6 +107,8 @@ namespace gbe return this->liveness->getLiveIn(bb); } + void loadLaneID(GenRegister dst); + void collectShifter(GenRegister dest, GenRegister src); void loadTopHalf(GenRegister dest, GenRegister src); void storeTopHalf(GenRegister dest, GenRegister src); diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 4cb88e9..36ee80a 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -133,8 +133,8 @@ namespace gbe void validateFlag(Selection &selection, SelectionInstruction &insn); /*! Allocate the GRF registers */ bool allocateGRFs(Selection &selection); - /*! Create gen registers for all preallocated curbe registers. */ - void allocatePayloadRegs(void); + /*! Create gen registers for all preallocated special registers. */ + void allocateSpecialRegs(void); /*! Create a Gen register from a register set in the payload */ void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0); /*! Create the intervals for each register */ @@ -228,7 +228,7 @@ namespace gbe this->intervals[reg].maxID = 0; } - INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) { + INLINE void GenRegAllocator::Opaque::allocateSpecialRegs(void) { using namespace ir; for(auto &it : this->ctx.curbeRegs) allocatePayloadReg(it.first, it.second); @@ -248,6 +248,18 @@ namespace gbe allocatePayloadReg(reg, it->second, subOffset); ctx.splitBlock(it->second, subOffset); } + + if (RA.contains(ocl::stackbuffer)) { + uint32_t regSize = this->ctx.ra->getRegSize(ocl::stackptr); + uint32_t offset = this->ctx.registerAllocator->allocate(regSize, regSize, 1); + RA.insert(std::make_pair(ocl::stackptr, offset)); + } + + // Group and barrier IDs are always allocated by the hardware in r0 + RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1 + RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6 + RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7 + RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2 } bool GenRegAllocator::Opaque::createGenReg(const Selection &selection, const GenRegInterval &interval) { @@ -1001,13 +1013,7 @@ namespace gbe this->intervals.push_back(ir::Register(regID)); // Allocate the special registers (only those which are actually used) - this->allocatePayloadRegs(); - - // Group and barrier IDs are always allocated by the hardware in r0 - RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1 - RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6 - RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7 - RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2 + this->allocateSpecialRegs(); // block IP used to handle the mask in SW is always allocated diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index fa75052..af19732 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -91,7 +91,6 @@ enum gbe_curbe_type { GBE_CURBE_GROUP_NUM_Z, GBE_CURBE_WORK_DIM, GBE_CURBE_IMAGE_INFO, - GBE_CURBE_STACK_POINTER, GBE_CURBE_PRINTF_BUF_POINTER, GBE_CURBE_PRINTF_INDEX_POINTER, GBE_CURBE_KERNEL_ARGUMENT, diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp index cff2463..efe192f 100644 --- a/backend/src/backend/program.hpp +++ b/backend/src/backend/program.hpp @@ -223,7 +223,7 @@ namespace gbe { uint32_t argNum; //!< Number of function arguments uint32_t curbeSize; //!< Size of the data to push uint32_t simdWidth; //!< SIMD size for the kernel (lane number) - uint32_t stackSize; //!< Stack size (may be 0 if unused) + uint32_t stackSize; //!< Stack size (0 if unused) uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused) bool useSLM; //!< SLM requires a special HW config uint32_t slmSize; //!< slm size for kernel variable diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 4adbd2b..0e60528 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -210,15 +210,6 @@ cl_curbe_fill(cl_kernel ker, UPLOAD(GBE_CURBE_WORK_DIM, work_dim); #undef UPLOAD - /* Write identity for the stack pointer. This is required by the stack pointer - * computation in the kernel - */ - if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) { - const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque); - uint32_t *stackptr = (uint32_t *) (ker->curbe + offset); - int32_t i; - for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i; - } /* Handle the various offsets to SLM */ const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque); int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque); -- 1.9.1 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet