> -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Yang Rong > Sent: Sunday, February 14, 2016 2:42 PM > To: [email protected] > Cc: Yang, Rong R <[email protected]> > Subject: [Beignet] [PATCH] GBE: remove stacksize 64KB limitation. > > If stacksize large 64KB, the formula of calculate the stackptr should > change, form "threadId * perThreadSize + laneId*perLaneSize" to > "(threadId * simdWidth + laneId)*perLaneSize", to avoid Dword * Dword. > > Signed-off-by: Yang Rong <[email protected]> > --- > backend/src/backend/context.cpp | 2 +- > backend/src/backend/gen75_context.cpp | 33 +++++++++++++-------------- > backend/src/backend/gen_context.cpp | 42 +++++++++++++++------------------- > - > backend/src/backend/gen_context.hpp | 3 ++- > 4 files changed, 36 insertions(+), 44 deletions(-) > > diff --git a/backend/src/backend/context.cpp > b/backend/src/backend/context.cpp > index 5adeabc..0991786 100644 > --- a/backend/src/backend/context.cpp > +++ b/backend/src/backend/context.cpp > @@ -398,7 +398,7 @@ namespace gbe > uint32_t stackSize = 128; > while (stackSize < fn.getStackSize()) { > stackSize *= 3; > - GBE_ASSERT(stackSize <= 64*KB); > + //GBE_ASSERT(stackSize <= 64*KB); > } > this->kernel->stackSize = stackSize; > } > diff --git a/backend/src/backend/gen75_context.cpp > b/backend/src/backend/gen75_context.cpp > index fa8b029..37063d7 100644 > --- a/backend/src/backend/gen75_context.cpp > +++ b/backend/src/backend/gen75_context.cpp > @@ -66,37 +66,34 @@ namespace gbe > > // Check that everything is consistent in the kernel code > const uint32_t perLaneSize = kernel->getStackSize(); > - const uint32_t perThreadSize = perLaneSize * this->simdWidth; > GBE_ASSERT(perLaneSize > 0); > > const GenRegister selStatckPtr = this->simdWidth == 8 ? 
> GenRegister::ud8grf(ir::ocl::stackptr) : > GenRegister::ud16grf(ir::ocl::stackptr); > const GenRegister stackptr = ra->genReg(selStatckPtr); > - > - loadLaneID(stackptr); > + // borrow block ip as temporary register as we will > + // initialize block ip latter. > + const GenRegister tmpReg = > GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW); > > // We compute the per-lane stack pointer here > - // private address start from zero > + // threadId * perThreadSize + laneId*perLaneSize or > + // (threadId * simdWidth + laneId)*perLaneSize > p->push(); > p->curr.execWidth = 1; > p->curr.predicate = GEN_PREDICATE_NONE; > //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), > GenRegister::immud(0x1ff)); > - p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), > GenRegister::immud(0x7f)); > - p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), > GenRegister::immud(0x180)); > - p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), > GenRegister::immud(7)); > - p->curr.execWidth = this->simdWidth; > - p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); > //perLaneSize < 64K > - p->curr.execWidth = 1; > - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immud(2)); > - p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::ud1grf(126, 4)); > - if(perThreadSize > 0xffff) { > - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immuw(perLaneSize)); > - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < > 64K > - } else > - p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immuw(perThreadSize)); > + p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); > + p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); > + p->SHR(stackptr, stackptr, GenRegister::immud(7)); > + p->SHL(tmpReg, tmpReg, 
GenRegister::immud(2)); > + p->ADD(tmpReg, tmpReg, stackptr); //threadId > + > + p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); > //threadId * simdWidth > p->curr.execWidth = this->simdWidth; > - p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); > + loadLaneID(stackptr); > + p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); > //threadId * simdWidth + laneId, must < 64K > + p->MUL(stackptr, stackptr, GenRegister::immud(perLaneSize)); // > (threadId > * simdWidth + laneId)*perLaneSize According to the hardware spec, for IVB and HSW: when both src0 and src1 are of type D or UD, only the low 16 bits of each element of src1 are used. The accumulator maintains full 48-bit precision. So it looks like you should place (threadId * simdWidth + laneId) in src1, since that value must stay below 64K while perLaneSize may now exceed it. Have you tried this on IVB or HSW?
Thanks! Ruiling _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
