> -----Original Message----- > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > junyan...@inbox.com > Sent: Tuesday, December 1, 2015 16:11 > To: beignet@lists.freedesktop.org > Subject: [Beignet] [PATCH 05/13] Backend: Establishing the thread/TID-EUID > map. > > From: Junyan He <junyan...@linux.intel.com> > > We need to use forward messages to send data and sync threads within the > same work group. The HW lacks the feature to get the TID and EUID of other > threads. So we need to establish a map for this usage. > > Signed-off-by: Junyan He <junyan...@linux.intel.com> > --- > backend/src/backend/gen_insn_selection.cpp | 128 > +++++++++++++++++++++++++++- > backend/src/llvm/llvm_gen_backend.cpp | 35 +++++++- > 2 files changed, 157 insertions(+), 6 deletions(-) > > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 884f89d..5b08958 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -495,6 +495,8 @@ namespace gbe > uint32_t vectorNum; > /*! If true, generate code backward */ > bool bwdCodeGeneration; > + /*! If true, the thread map has already been stored */ > + bool storeThreadMap; > /*! 
To make function prototypes more readable */ > typedef const GenRegister &Reg; > > @@ -806,8 +808,9 @@ namespace gbe > ctx(ctx), block(NULL), > curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()), > maxInsnNum(ctx.getFunction().getLargestBlockSize()), > dagPool(maxInsnNum), > - stateNum(0), vectorNum(0), bwdCodeGeneration(false), > currAuxLabel(ctx.getFunction().labelNum()), > - bHas32X32Mul(false), bHasLongType(false), bHasDoubleType(false), > bHasHalfType(false), bLongRegRestrict(false), > + stateNum(0), vectorNum(0), bwdCodeGeneration(false), > storeThreadMap(false), > + currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false), > bHasLongType(false), > + bHasDoubleType(false), bHasHalfType(false), > + bLongRegRestrict(false), > ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false) > { > const ir::Function &fn = ctx.getFunction(); @@ -5967,6 +5970,112 @@ > namespace gbe > /*! WorkGroup instruction pattern */ > DECL_PATTERN(WorkGroupInstruction) > { > + INLINE bool storeThreadID(Selection::Opaque &sel, uint32_t slmAddr) > const > + { > + using namespace ir; > + GenRegister sr0_0 = GenRegister::retype(GenRegister::sr(0), > GEN_TYPE_UW); > + const uint32_t simdWidth = sel.ctx.getSimdWidth(); > + GenRegister tmp; > + GenRegister addr; > + vector<GenRegister> fakeTemps; > + fakeTemps.push_back(GenRegister::null()); > + fakeTemps.push_back(GenRegister::null()); There is no need to push here; I think using an empty vector is OK.
> + > + if (simdWidth == 16) { > + tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + } else { > + tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), > ir::TYPE_U32), GEN_TYPE_UD); > + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), > ir::TYPE_U32), GEN_TYPE_UD); > + } > + > + sr0_0.vstride = GEN_VERTICAL_STRIDE_0; > + sr0_0.hstride = GEN_HORIZONTAL_STRIDE_0; > + sr0_0.width = GEN_WIDTH_1; Use GenRegister::vec1 here? > + sel.push(); { > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.curr.execWidth = 8; > + > + sel.MOV(tmp, sr0_0); > + > + sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32), > GenRegister::immud(2)); > + sel.ADD(addr, addr, GenRegister::immud(slmAddr)); > + > + sel.push(); { > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.push(); { > + sel.curr.execWidth = 1; > + sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x01)); > + } sel.pop(); > + sel.curr.flag = 0; > + sel.curr.subFlag = 1; > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.BYTE_SCATTER(addr, tmp, 1, GenRegister::immw(0xfe), > fakeTemps); > + } sel.pop(); > + } sel.pop(); > + return true; > + } > + > + INLINE GenRegister getNextThreadID(Selection::Opaque &sel, uint32_t > slmAddr) const > + { > + using namespace ir; > + const uint32_t simdWidth = sel.ctx.getSimdWidth(); > + GenRegister addr; > + GenRegister nextThread; > + GenRegister tid; > + vector<GenRegister> fakeTemps; > + fakeTemps.push_back(GenRegister::null()); > + fakeTemps.push_back(GenRegister::null()); Same as storeThreadID. 
> + > + if (simdWidth == 16) { > + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + nextThread = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + tid = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + } else { > + addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); > + nextThread = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); > + tid = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); > + } > + > + sel.push(); { > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32), > + GenRegister::immud(1)); > + > + /* Wrap the next thread id. */ > + sel.push(); { > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.curr.flag = 0; > + sel.curr.subFlag = 1; > + sel.CMP(GEN_CONDITIONAL_EQ, nextThread, sel.selReg(ocl::threadn, > ir::TYPE_U32), GenRegister::null()); > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.MOV(nextThread, GenRegister::immud(0)); > + } sel.pop(); > + > + sel.MUL(addr, nextThread, GenRegister::immud(2)); > + sel.ADD(addr, addr, GenRegister::immud(slmAddr)); > + > + sel.push(); { > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.push(); { > + sel.curr.execWidth = 1; > + sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x010)); > + } sel.pop(); > + sel.curr.flag = 0; > + sel.curr.subFlag = 1; > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.BYTE_GATHER(tid, addr, 1, GenRegister::immw(0xfe), fakeTemps); > + } sel.pop(); > + > + } sel.pop(); > + return tid; > + } > + > INLINE bool emitWGBroadcast(Selection::Opaque &sel, const > ir::WorkGroupInstruction &insn) const { > /* 1. BARRIER Ensure all the threads have set the correct value > for the > var which will be broadcasted. > 2. CMP IDs Compare the local IDs with the specified ones in the > function call. 
> @@ -6042,6 +6151,21 @@ namespace gbe > > if (workGroupOp == WORKGROUP_OP_BROADCAST) { > return emitWGBroadcast(sel, insn); > + } else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD && > workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) { > + const uint32_t slmAddr = insn.getSlmAddr(); > + /* First, we create the TheadID/localID map, in order to get > + which thread hold the next 16 workitems. */ > + > + if (!sel.storeThreadMap) { > + this->storeThreadID(sel, slmAddr); > + sel.storeThreadMap = true; > + } > + > + /* Then we insert a barrier to make sure all the var we are > interested in > + have been assigned the final value. */ > + sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), > + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); > + > + /* Third, get the next thread ID which we will Forward MSG to. */ > + GenRegister nextThreadID = getNextThreadID(sel, slmAddr); > } else { > GBE_ASSERT(0); > } > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 2137814..d50ed42 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -3711,6 +3711,20 @@ namespace gbe > GBE_ASSERT(f.getwgBroadcastSLM() >= 0); > } > > + if (f.gettidMapSLM() < 0 && opcode >= > ir::WORKGROUP_OP_REDUCE_ADD && opcode <= > ir::WORKGROUP_OP_EXCLUSIVE_MAX) { > + /* Because we can not know the thread ID and the EUID for every > physical > + thead which the work items execute on before the run time. We need > to > + sync the thread execution order when using work group functions. We > + create the workitems/threadID map table in slm. > + When we come to here, the global thread local vars should have all > been > + allocated, so it's safe for us to steal a piece of SLM for this > usage. */ > + uint32_t mapSize = sizeof(uint16_t) * 64;// at most 64 thread for one > subslice. 
> + f.setUseSLM(true); > + uint32_t oldSlm = f.getSLMSize(); > + f.setSLMSize(oldSlm + mapSize); > + f.settidMapSLM(oldSlm); > + GBE_ASSERT(f.gettidMapSLM() >= 0); > + } > > CallSite::arg_iterator AI = CS.arg_begin(); > CallSite::arg_iterator AE = CS.arg_end(); @@ -3731,10 +3745,23 @@ > namespace gbe > ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST, > (uint32_t)f.getwgBroadcastSLM(), getRegister(&I), srcTuple, argNum, > getType(ctx, (*CS.arg_begin())->getType())); > } else { > - const ir::Register src = this->getRegister(*(AI++)); > - const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1); > - ctx.WORKGROUP(opcode, (uint32_t)0, getRegister(&I), srcTuple, 1, > - getType(ctx, (*CS.arg_begin())->getType())); > + ConstantInt *sign = dyn_cast<ConstantInt>(AI); > + GBE_ASSERT(sign); > + bool isSign = sign->getZExtValue(); > + AI++; > + ir::Type ty; > + if (isSign) { > + ty = getType(ctx, (*AI)->getType()); > + } else { > + ty = getUnsignedType(ctx, (*AI)->getType()); > + } > + > + ir::Register src[3]; > + src[0] = ir::ocl::threadn; > + src[1] = ir::ocl::threadid; > + src[2] = this->getRegister(*(AI++)); > + const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3); > + ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(), > + getRegister(&I), srcTuple, 3, ty); > } > > GBE_ASSERT(AI == AE); > -- > 1.7.9.5 > > > > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet