The patchset LGTM. Thanks!
> -----Original Message----- > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > Yang Rong > Sent: Wednesday, June 24, 2015 9:58 AM > To: beignet@lists.freedesktop.org > Cc: Yang, Rong R > Subject: [Beignet] [Patch V2 1/2] Use NP2 stack size to avoid cache line > conflict. > > The L3 cacheline size 64B, so calc the stack size from 64, and mul 3 per step. > Gen only support D * W before GEN8. So when calculate per lane stack > address, need take care of the mul. > > V2: calc the stack size from 128B, because long16 need 128B alignment. > Signed-off-by: Yang Rong <rong.r.y...@intel.com> > --- > backend/src/backend/context.cpp | 4 ++-- > backend/src/backend/gen75_context.cpp | 13 ++++++------- > backend/src/backend/gen_context.cpp | 13 ++++++------- > 3 files changed, 14 insertions(+), 16 deletions(-) > > diff --git a/backend/src/backend/context.cpp > b/backend/src/backend/context.cpp index 0dc60b7..b8dfa8c 100644 > --- a/backend/src/backend/context.cpp > +++ b/backend/src/backend/context.cpp > @@ -400,9 +400,9 @@ namespace gbe > return; > // Be sure that the stack pointer is set > // > GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, > 0) >= 0); > - uint32_t stackSize = 1*KB; > + uint32_t stackSize = 128; > while (stackSize < fn.getStackSize()) { > - stackSize <<= 1; > + stackSize *= 3; > GBE_ASSERT(stackSize <= 64*KB); > } > this->kernel->stackSize = stackSize; diff --git > a/backend/src/backend/gen75_context.cpp > b/backend/src/backend/gen75_context.cpp > index caf7043..b9dfb18 100644 > --- a/backend/src/backend/gen75_context.cpp > +++ b/backend/src/backend/gen75_context.cpp > @@ -74,12 +74,7 @@ namespace gbe > const uint32_t perLaneSize = kernel->getStackSize(); > const uint32_t perThreadSize = perLaneSize * this->simdWidth; > GBE_ASSERT(perLaneSize > 0); > - GBE_ASSERT(isPowerOf<2>(perLaneSize) == true); > - GBE_ASSERT(isPowerOf<2>(perThreadSize) == true); > > - // Use shifts rather than muls which are limited to 32x16 bit sources > - const uint32_t perLaneShift = logi2(perLaneSize); > - const uint32_t perThreadShift = logi2(perThreadSize); > const GenRegister selStatckPtr = this->simdWidth == 8 ? > GenRegister::ud8grf(ir::ocl::stackptr) : > GenRegister::ud16grf(ir::ocl::stackptr); > @@ -95,11 +90,15 @@ namespace gbe > p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), > GenRegister::immud(0x180)); > p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), > GenRegister::immud(7)); > p->curr.execWidth = this->simdWidth; > - p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift)); > + p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); > + //perLaneSize < 64K > p->curr.execWidth = 1; > p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immud(2)); > p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::ud1grf(126, 4)); > - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immud(perThreadShift)); > + if(perThreadSize > 0xffff) { > + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immuw(perLaneSize)); > + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize > < 64K > + } else > + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > + GenRegister::immuw(perThreadSize)); > p->curr.execWidth = this->simdWidth; > p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); > p->pop(); > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 43d14d2..db27377 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -182,12 +182,7 @@ namespace gbe > const uint32_t perLaneSize = kernel->getStackSize(); > const uint32_t perThreadSize = perLaneSize * this->simdWidth; > GBE_ASSERT(perLaneSize > 0); > - GBE_ASSERT(isPowerOf<2>(perLaneSize) == true); > - GBE_ASSERT(isPowerOf<2>(perThreadSize) == true); > > - // Use shifts rather than muls which are limited to 32x16 bit sources > - const uint32_t perLaneShift = logi2(perLaneSize); > - const uint32_t perThreadShift = logi2(perThreadSize); > const GenRegister selStatckPtr = this->simdWidth == 8 ? > GenRegister::ud8grf(ir::ocl::stackptr) : > GenRegister::ud16grf(ir::ocl::stackptr); > @@ -201,9 +196,13 @@ namespace gbe > p->curr.predicate = GEN_PREDICATE_NONE; > p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), > GenRegister::immud(0x1ff)); > p->curr.execWidth = this->simdWidth; > - p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift)); > + p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); > + //perLaneSize < 64K > p->curr.execWidth = 1; > - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immud(perThreadShift)); > + if(perThreadSize > 0xffff) { > + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immuw(perLaneSize)); > + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize > < 64K > + } else > + p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), > + GenRegister::immuw(perThreadSize)); > p->curr.execWidth = this->simdWidth; > p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0)); > p->pop(); > -- > 1.8.3.2 > > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet