LGTM, thanks. -----Original Message----- From: [email protected] [mailto:[email protected]] On Behalf Of Zhigang Gong Sent: Friday, January 17, 2014 10:49 AM To: [email protected] Cc: Gong, Zhigang Subject: [Beignet] [PATCH] GBE: fixed the stack allocation.
Yongjia wrote a case hit the previous 1KB limitation. I took a look at the stack pointer related code then I found the implementation is not comply with the OCL spec. According to OpenCL spec, section 6.9: d. Variable length arrays and structures with flexible (or unsized) arrays are not supported. Thus all the local variable size should be constant, and we can manipulate the stack pointer easier , no need to do the alignment calculating at runtime, and could get the eaxct stack size then allocate stack size on demand. I still put a limitation there which is 64KB. Signed-off-by: Zhigang Gong <[email protected]> --- backend/src/backend/context.cpp | 7 +++- backend/src/ir/function.cpp | 2 +- backend/src/ir/function.hpp | 5 +++ backend/src/llvm/llvm_gen_backend.cpp | 61 +++++++++++---------------------- 4 files changed, 32 insertions(+), 43 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 7a80dc8..2125bd1 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -389,7 +389,12 @@ namespace gbe return; // Be sure that the stack pointer is set GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0); - this->kernel->stackSize = 1*KB; // XXX compute that in a better way + uint32_t stackSize = 1*KB; + while (stackSize < fn.getStackSize()) { + stackSize <<= 1; + GBE_ASSERT(stackSize <= 64*KB); + } + this->kernel->stackSize = stackSize; } uint32_t Context::newCurbeEntry(gbe_curbe_type value, diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp index 4273f66..71dcc1f 100644 --- a/backend/src/ir/function.cpp +++ b/backend/src/ir/function.cpp @@ -43,7 +43,7 @@ namespace ir { /////////////////////////////////////////////////////////////////////////// Function::Function(const std::string &name, const Unit &unit, Profile profile) : - name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0) + name(name), unit(unit), profile(profile), simdWidth(0), + useSLM(false), slmSize(0), stackSize(0) { initProfile(*this); samplerSet = GBE_NEW(SamplerSet); diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp index 11e268b..2468e73 100644 --- a/backend/src/ir/function.hpp +++ b/backend/src/ir/function.hpp @@ -314,6 +314,10 @@ namespace ir { void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compileWgSize[0] = x; compileWgSize[1] = y; compileWgSize[2] = z; } /*! Get required work group size. */ const size_t *getCompileWorkGroupSize(void) const {return compileWgSize;} + /*! Get stack size. */ + INLINE const uint32_t getStackSize(void) const { return this->stackSize; } + /*! Push stack size. */ + INLINE void pushStackSize(uint32_t step) { this->stackSize += step; + } private: friend class Context; //!< Can freely modify a function std::string name; //!< Function name @@ -330,6 +334,7 @@ namespace ir { uint32_t simdWidth; //!< 8 or 16 if forced, 0 otherwise bool useSLM; //!< Is SLM required? uint32_t slmSize; //!< local variable size inside kernel function + uint32_t stackSize; //!< stack size for private memory. SamplerSet *samplerSet; //!< samplers used in this function. ImageSet* imageSet; //!< Image set in this function's arguments.. size_t compileWgSize[3]; //!< required work group size specified by diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index b0555aa..ebead77 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -2757,32 +2757,23 @@ namespace gbe Value *src = I.getOperand(0); Type *elemType = I.getType()->getElementType(); ir::ImmediateIndex immIndex; - bool needMultiply = true; + uint32_t elementSize = getTypeByteSize(unit, elemType); // Be aware, we manipulate pointers if (ctx.getPointerSize() == ir::POINTER_32_BITS) - immIndex = ctx.newImmediate(uint32_t(getTypeByteSize(unit, elemType))); + immIndex = ctx.newImmediate(uint32_t(elementSize)); else - immIndex = ctx.newImmediate(uint64_t(getTypeByteSize(unit, elemType))); + immIndex = ctx.newImmediate(uint64_t(elementSize)); // OK, we try to see if we know compile time the size we need to allocate - if (I.isArrayAllocation() == false) // one element allocated only - needMultiply = false; - else { + if (I.isArrayAllocation() == true) { Constant *CPV = dyn_cast<Constant>(src); - if (CPV) { - const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx)); - ir::Immediate imm = ctx.getImmediate(immIndex); - imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4); - ctx.setImmediate(immIndex, imm); - needMultiply = false; - } else { - // Brutal but cheap way to get arrays aligned on 4 bytes: we just align - // the element on 4 bytes! - ir::Immediate imm = ctx.getImmediate(immIndex); - imm.data.u64 = ALIGN(imm.data.u64, 4); - ctx.setImmediate(immIndex, imm); - } + GBE_ASSERT(CPV); + const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx)); + ir::Immediate imm = ctx.getImmediate(immIndex); + imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4); + elementSize *= elemNum; + ctx.setImmediate(immIndex, imm); } // Now emit the stream of instructions to get the allocated pointer @@ -2797,32 +2788,20 @@ namespace gbe // align the stack pointer according to data alignment if(align > 1) { - // (ptr + (align-1)) & ~(align-1) - ir::ImmediateIndex immAlign; - immAlign = ctx.newIntegerImmediate(align-1, ir::TYPE_U32); - ir::Register alignReg = ctx.reg(ctx.getPointerFamily()); - ctx.LOADI(ir::TYPE_S32, alignReg, immAlign); - ctx.ADD(ir::TYPE_U32, stack, stack, alignReg); - - alignReg = ctx.reg(ctx.getPointerFamily()); - immAlign = ctx.newIntegerImmediate(~(align-1), ir::TYPE_U32); - ctx.LOADI(ir::TYPE_S32, alignReg, immAlign); - ctx.AND(ir::TYPE_U32, stack, stack, alignReg); + uint32_t prevStackPtr = ctx.getFunction().getStackSize(); + uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr; + ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32); + ir::Register stepReg = ctx.reg(ctx.getPointerFamily()); + ctx.LOADI(ir::TYPE_S32, stepReg, stepImm); + ctx.ADD(ir::TYPE_U32, stack, stack, stepReg); + ctx.getFunction().pushStackSize(step); } // Set the destination register properly ctx.MOV(imm.type, dst, stack); - // Easy case, we just increment the stack pointer - if (needMultiply == false) { - ctx.LOADI(imm.type, reg, immIndex); - ctx.ADD(imm.type, stack, stack, reg); - } - // Harder case (variable length array) that requires a multiply - else { - ctx.LOADI(imm.type, reg, immIndex); - ctx.MUL(imm.type, reg, this->getRegister(src), reg); - ctx.ADD(imm.type, stack, stack, reg); - } + ctx.LOADI(imm.type, reg, immIndex); + ctx.ADD(imm.type, stack, stack, reg); + ctx.getFunction().pushStackSize(elementSize); } static INLINE Value *getLoadOrStoreValue(LoadInst &I) { -- 1.7.9.5 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
