If the stack size is larger than 64KB, the formula for calculating the stackptr should change from "threadId * perThreadSize + laneId*perLaneSize" to "(threadId * simdWidth + laneId)*perLaneSize", to avoid a Dword * Dword multiplication.
Signed-off-by: Yang Rong <[email protected]> --- backend/src/backend/context.cpp | 2 +- backend/src/backend/gen75_context.cpp | 33 +++++++++++++-------------- backend/src/backend/gen_context.cpp | 42 +++++++++++++++-------------------- backend/src/backend/gen_context.hpp | 3 ++- 4 files changed, 36 insertions(+), 44 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 5adeabc..0991786 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -398,7 +398,7 @@ namespace gbe uint32_t stackSize = 128; while (stackSize < fn.getStackSize()) { stackSize *= 3; - GBE_ASSERT(stackSize <= 64*KB); + //GBE_ASSERT(stackSize <= 64*KB); } this->kernel->stackSize = stackSize; } diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp index fa8b029..37063d7 100644 --- a/backend/src/backend/gen75_context.cpp +++ b/backend/src/backend/gen75_context.cpp @@ -66,37 +66,34 @@ namespace gbe // Check that everything is consistent in the kernel code const uint32_t perLaneSize = kernel->getStackSize(); - const uint32_t perThreadSize = perLaneSize * this->simdWidth; GBE_ASSERT(perLaneSize > 0); const GenRegister selStatckPtr = this->simdWidth == 8 ? GenRegister::ud8grf(ir::ocl::stackptr) : GenRegister::ud16grf(ir::ocl::stackptr); const GenRegister stackptr = ra->genReg(selStatckPtr); - - loadLaneID(stackptr); + // borrow block ip as temporary register as we will + // initialize block ip latter. 
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW); // We compute the per-lane stack pointer here - // private address start from zero + // threadId * perThreadSize + laneId*perLaneSize or + // (threadId * simdWidth + laneId)*perLaneSize p->push(); p->curr.execWidth = 1; p->curr.predicate = GEN_PREDICATE_NONE; //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); - p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); - p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); - p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7)); - p->curr.execWidth = this->simdWidth; - p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K - p->curr.execWidth = 1; - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2)); - p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4)); - if(perThreadSize > 0xffff) { - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize)); - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K - } else - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize)); + p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); + p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); + p->SHR(stackptr, stackptr, GenRegister::immud(7)); + p->SHL(tmpReg, tmpReg, GenRegister::immud(2)); + p->ADD(tmpReg, tmpReg, stackptr); //threadId + + p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth p->curr.execWidth = this->simdWidth; - p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); + loadLaneID(stackptr); + p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); 
//threadId * simdWidth + laneId, must < 64K + p->MUL(stackptr, stackptr, GenRegister::immud(perLaneSize)); // (threadId * simdWidth + laneId)*perLaneSize p->pop(); } diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 0ea0dd0..99190d3 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -148,33 +148,33 @@ namespace gbe } /* Get proper block ip register according to current label width. */ - static GenRegister getBlockIP(GenContext &ctx) { + GenRegister GenContext::getBlockIP(void) { GenRegister blockip; - if (!ctx.isDWLabel()) - blockip = ctx.ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); + if (!isDWLabel()) + blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); else - blockip = ctx.ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip)); + blockip = ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip)); return blockip; } /* Set current block ip register to a specified constant label value. */ - static void setBlockIP(GenContext &ctx, GenRegister blockip, uint32_t label) { - if (!ctx.isDWLabel()) - ctx.p->MOV(blockip, GenRegister::immuw(label)); + void GenContext::setBlockIP(GenRegister blockip, uint32_t label) { + if (!isDWLabel()) + p->MOV(blockip, GenRegister::immuw(label)); else - ctx.p->MOV(blockip, GenRegister::immud(label)); + p->MOV(blockip, GenRegister::immud(label)); } void GenContext::clearFlagRegister(void) { // when group size not aligned to simdWidth, flag register need clear to // make prediction(any8/16h) work correctly - const GenRegister blockip = getBlockIP(*this); + const GenRegister blockip = getBlockIP(); p->push(); p->curr.noMask = 1; p->curr.predicate = GEN_PREDICATE_NONE; - setBlockIP(*this, blockip, getMaxLabel()); + setBlockIP(blockip, getMaxLabel()); p->curr.noMask = 0; - setBlockIP(*this, blockip, 0); + setBlockIP(blockip, 0); p->curr.execWidth = 1; if (ra->isAllocated(ir::ocl::zero)) p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::zero)), 
GenRegister::immuw(0)); @@ -219,7 +219,6 @@ namespace gbe // Check that everything is consistent in the kernel code const uint32_t perLaneSize = kernel->getStackSize(); - const uint32_t perThreadSize = perLaneSize * this->simdWidth; GBE_ASSERT(perLaneSize > 0); const GenRegister selStatckPtr = this->simdWidth == 8 ? @@ -228,28 +227,23 @@ namespace gbe const GenRegister stackptr = ra->genReg(selStatckPtr); // borrow block ip as temporary register as we will // initialize block ip latter. - const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP(*this)), GEN_TYPE_UD); + const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW); loadLaneID(stackptr); // We compute the per-lane stack pointer here - // threadId * perThreadSize + laneId*perLaneSize + // threadId * perThreadSize + laneId*perLaneSize or + // (threadId * simdWidth + laneId)*perLaneSize // let private address start from zero //p->MOV(stackptr, GenRegister::immud(0)); p->push(); p->curr.execWidth = 1; p->curr.predicate = GEN_PREDICATE_NONE; - p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); + p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId + p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth p->curr.execWidth = this->simdWidth; - p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K - p->curr.execWidth = 1; - if(perThreadSize > 0xffff) { - p->MUL(tmpReg, tmpReg, GenRegister::immuw(perLaneSize)); - p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K - } else - p->MUL(tmpReg, tmpReg, GenRegister::immuw(perThreadSize)); - p->curr.execWidth = this->simdWidth; - p->ADD(stackptr, stackptr, tmpReg); + p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K + p->MUL(stackptr, stackptr, GenRegister::immud(perLaneSize)); // (threadId * simdWidth + 
laneId)*perLaneSize p->pop(); } diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 22ec0ea..25cce85 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -110,7 +110,8 @@ namespace gbe } void loadLaneID(GenRegister dst); - + GenRegister getBlockIP(void); + void setBlockIP(GenRegister blockip, uint32_t label); void collectShifter(GenRegister dest, GenRegister src); void loadTopHalf(GenRegister dest, GenRegister src); void storeTopHalf(GenRegister dest, GenRegister src); -- 2.1.4 _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
