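The L3 cache line size is 64B, so calculate the stack size starting from 64 and multiply it by 3 at each step. Before GEN8, Gen hardware only supports D * W multiplication (a 32-bit source times a 16-bit source), so take care with the MUL when calculating the per-lane stack address.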
V2: calc the stack size from 128B, because long16 need 128B alignment. Signed-off-by: Yang Rong <rong.r.y...@intel.com> --- backend/src/backend/context.cpp | 4 ++-- backend/src/backend/gen75_context.cpp | 13 ++++++------- backend/src/backend/gen_context.cpp | 13 ++++++------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 0dc60b7..b8dfa8c 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -400,9 +400,9 @@ namespace gbe return; // Be sure that the stack pointer is set // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0); - uint32_t stackSize = 1*KB; + uint32_t stackSize = 128; while (stackSize < fn.getStackSize()) { - stackSize <<= 1; + stackSize *= 3; GBE_ASSERT(stackSize <= 64*KB); } this->kernel->stackSize = stackSize; diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp index caf7043..b9dfb18 100644 --- a/backend/src/backend/gen75_context.cpp +++ b/backend/src/backend/gen75_context.cpp @@ -74,12 +74,7 @@ namespace gbe const uint32_t perLaneSize = kernel->getStackSize(); const uint32_t perThreadSize = perLaneSize * this->simdWidth; GBE_ASSERT(perLaneSize > 0); - GBE_ASSERT(isPowerOf<2>(perLaneSize) == true); - GBE_ASSERT(isPowerOf<2>(perThreadSize) == true); - // Use shifts rather than muls which are limited to 32x16 bit sources - const uint32_t perLaneShift = logi2(perLaneSize); - const uint32_t perThreadShift = logi2(perThreadSize); const GenRegister selStatckPtr = this->simdWidth == 8 ? 
GenRegister::ud8grf(ir::ocl::stackptr) : GenRegister::ud16grf(ir::ocl::stackptr); @@ -95,11 +90,15 @@ namespace gbe p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7)); p->curr.execWidth = this->simdWidth; - p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift)); + p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K p->curr.execWidth = 1; p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2)); p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4)); - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift)); + if(perThreadSize > 0xffff) { + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize)); + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K + } else + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize)); p->curr.execWidth = this->simdWidth; p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); p->pop(); diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 43d14d2..db27377 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -182,12 +182,7 @@ namespace gbe const uint32_t perLaneSize = kernel->getStackSize(); const uint32_t perThreadSize = perLaneSize * this->simdWidth; GBE_ASSERT(perLaneSize > 0); - GBE_ASSERT(isPowerOf<2>(perLaneSize) == true); - GBE_ASSERT(isPowerOf<2>(perThreadSize) == true); - // Use shifts rather than muls which are limited to 32x16 bit sources - const uint32_t perLaneShift = logi2(perLaneSize); - const uint32_t perThreadShift = logi2(perThreadSize); const GenRegister selStatckPtr = this->simdWidth == 8 ? 
GenRegister::ud8grf(ir::ocl::stackptr) : GenRegister::ud16grf(ir::ocl::stackptr); @@ -201,9 +196,13 @@ namespace gbe p->curr.predicate = GEN_PREDICATE_NONE; p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); p->curr.execWidth = this->simdWidth; - p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift)); + p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K p->curr.execWidth = 1; - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift)); + if(perThreadSize > 0xffff) { + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize)); + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K + } else + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize)); p->curr.execWidth = this->simdWidth; p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); p->pop(); -- 1.8.3.2 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet