From: Junyan He <[email protected]> We will use forward message and n0.2 notification to sync all threads.
Signed-off-by: Junyan He <[email protected]> --- backend/src/backend/gen_context.cpp | 334 +++++++++++++++++++++++++++++++++++ backend/src/backend/gen_context.hpp | 1 + 2 files changed, 335 insertions(+) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index a9663d7..97c81f4 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -1680,6 +1680,340 @@ namespace gbe p->ATOMIC(dst, function, src, bti, insn.srcNum); } + static void workgroupOpBetweenThread(GenRegister msgData, GenRegister theVal, GenRegister threadData, + uint32_t simd, uint32_t wg_op, GenEncoder *p) { + p->push(); + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 1; + + if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) { + p->SEL_CMP(GEN_CONDITIONAL_LE, msgData, threadData, msgData); + } + + p->pop(); + } + + static void workgroupOpInThread(GenRegister msgData, GenRegister theVal, GenRegister threadData, + uint32_t simd, uint32_t wg_op, GenEncoder *p) { + p->push(); + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 1; + + /* Setting the init value here. */ + if (wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN || wg_op == ir::WORKGROUP_OP_REDUCE_MIN) { + GenRegister::retype(threadData, theVal.type); + if (theVal.type == GEN_TYPE_UD) { + p->MOV(threadData, GenRegister::immud(0xFFFFFFFF)); + } + } + + if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) { + // TODO: theVal may be scalar. + GenRegister v = theVal; + v.vstride = GEN_VERTICAL_STRIDE_0; + v.width = GEN_WIDTH_1; + v.hstride = GEN_HORIZONTAL_STRIDE_0; + for (uint32_t i = 0; i < simd; i++) { + p->SEL_CMP(GEN_CONDITIONAL_LE, threadData, threadData, v); + v.subnr += typeSize(theVal.type); + if (v.subnr == 32) { + v.subnr = 0; + v.nr++; + } + } + } + + p->MOV(msgData, threadData); + p->pop(); + } + + void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn) { + const GenRegister dst = ra->genReg(insn.dst(0)); + GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag); + GenRegister nextThreadID = ra->genReg(insn.src(1)); + const GenRegister theVal = ra->genReg(insn.src(0)); + GenRegister threadid = ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)); + GenRegister msgData = GenRegister::retype(nextThreadID, dst.type); // The data forward. + msgData.vstride = GEN_VERTICAL_STRIDE_0; + msgData.width = GEN_WIDTH_1; + msgData.hstride = GEN_HORIZONTAL_STRIDE_0; + GenRegister threadData = + GenRegister::retype(GenRegister::offset(nextThreadID, 0, 24), dst.type); // Res within thread. + threadData.vstride = GEN_VERTICAL_STRIDE_0; + threadData.width = GEN_WIDTH_1; + threadData.hstride = GEN_HORIZONTAL_STRIDE_0; + uint32_t wg_op = insn.extra.workgroupOp; + uint32_t simd = p->curr.execWidth; + GenRegister flag_save = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 8), GEN_TYPE_UW); + flag_save.vstride = GEN_VERTICAL_STRIDE_0; + flag_save.width = GEN_WIDTH_1; + flag_save.hstride = GEN_HORIZONTAL_STRIDE_0; + + p->push(); { /* First, so something within thread. */ + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + /* Do some calculation within each thread. */ + workgroupOpInThread(msgData, theVal, threadData, simd, wg_op, p); + } p->pop(); + + p->push(); { /* We begin from threadid 0. */ + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 1; + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + p->CMP(GEN_CONDITIONAL_EQ, threadid, GenRegister::immud(0x0)); + + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.inversePredicate = 1; + p->MOV(flag_save, GenRegister::immuw(0x0)); + p->curr.inversePredicate = 0; + p->MOV(flag_save, GenRegister::immuw(0xffff)); + + p->curr.predicate = GEN_PREDICATE_NONE; + p->MOV(flagReg, flag_save); + } p->pop(); + + p->push(); { + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.noMask = 1; + p->curr.execWidth = 1; + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + p->curr.inversePredicate = 1; + p->IF(GenRegister::immuw(6)); /* Not the first thread, wait for msg first. */ + p->WAIT(2); + p->curr.predicate = GEN_PREDICATE_NONE; + p->ENDIF(GenRegister::immuw(2)); + /* Do something when get the msg. */ + workgroupOpBetweenThread(msgData, theVal, threadData, simd, wg_op, p); + + /* Restore the flag. */ + p->curr.predicate = GEN_PREDICATE_NONE; + p->MOV(flagReg, flag_save); + } p->pop(); + + p->push(); { /* then send msg. */ + p->curr.noMask = 1; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.execWidth = 1; + GenRegister offLen = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 20), GEN_TYPE_UD); + offLen.vstride = GEN_VERTICAL_STRIDE_0; + offLen.width = GEN_WIDTH_1; + offLen.hstride = GEN_HORIZONTAL_STRIDE_0; + uint32_t szEnc = typeSize(theVal.type) >> 1; + if (szEnc == 4) { + szEnc = 3; + } + p->MOV(offLen, GenRegister::immud((szEnc << 8) | (nextThreadID.nr << 21))); + + GenRegister tidEuid = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 16), GEN_TYPE_UD); + tidEuid.vstride = GEN_VERTICAL_STRIDE_0; + tidEuid.width = GEN_WIDTH_1; + tidEuid.hstride = GEN_HORIZONTAL_STRIDE_0; + p->SHL(tidEuid, tidEuid, GenRegister::immud(16)); + + p->curr.execWidth = 8; + p->FWD_GATEWAY_MSG(nextThreadID, 2); + } p->pop(); + + p->push(); { /* If we are first thread, wait last one to notify us. */ + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + p->curr.execWidth = 1; + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->IF(GenRegister::immuw(6)); + p->WAIT(2); + p->curr.predicate = GEN_PREDICATE_NONE; + p->ENDIF(GenRegister::immuw(2)); + } p->pop(); + + /* Broadcast the result. */ + if (wg_op == ir::WORKGROUP_OP_REDUCE_MIN) { + p->push(); { + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.noMask = 1; + p->curr.execWidth = 1; + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + p->curr.inversePredicate = 1; + p->IF(GenRegister::immuw(6)); /* Not the first thread, wait for msg first. */ + p->WAIT(2); + p->curr.predicate = GEN_PREDICATE_NONE; + p->ENDIF(GenRegister::immuw(2)); + /* Do something when get the msg. */ + p->curr.execWidth = simd; + p->MOV(dst, msgData); + + p->curr.execWidth = 8; + p->FWD_GATEWAY_MSG(nextThreadID, 2); + + p->curr.execWidth = 1; + p->curr.inversePredicate = 0; + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->IF(GenRegister::immuw(6)); + p->WAIT(2); + p->curr.predicate = GEN_PREDICATE_NONE; + p->ENDIF(GenRegister::immuw(2)); + } p->pop(); + } + } + +#if 0 + void GenContext::emitWGBroadcastInstruction(const SelectionInstruction &insn) { + GenRegister dimX, dimY, dimZ; + GenRegister lid0, lid1, lid2; + int dim = insn.srcNum - 2; + if (p->curr.execWidth == 16) { + lid0 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid0)); + lid1 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid1)); + lid2 = ra->genReg(GenRegister::ud16grf(ir::ocl::lid2)); + } else { + lid0 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid0)); + lid1 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid1)); + lid2 = ra->genReg(GenRegister::ud8grf(ir::ocl::lid2)); + } + + p->push(); { /* First, is the specified LocalID belong to this thread ? */ + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + dimX = ra->genReg(insn.src(1)); + p->CMP(GEN_CONDITIONAL_EQ, dimX, lid0); + p->curr.predicate = GEN_PREDICATE_NORMAL; + if (dim >= 2) { + dimY = ra->genReg(insn.src(2)); + p->CMP(GEN_CONDITIONAL_EQ, dimY, lid1); + } + if (dim == 3) { + dimZ = ra->genReg(insn.src(3)); + p->CMP(GEN_CONDITIONAL_EQ, dimY, lid2); + } + } p->pop(); + + GenRegister res = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 8), GEN_TYPE_UW); + GenRegister fbl = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 4), GEN_TYPE_UD); + p->push(); { + res.vstride = GEN_VERTICAL_STRIDE_0; + res.width = GEN_WIDTH_1; + res.hstride = GEN_HORIZONTAL_STRIDE_0; + fbl.vstride = GEN_VERTICAL_STRIDE_0; + fbl.width = GEN_WIDTH_1; + fbl.hstride = GEN_HORIZONTAL_STRIDE_0; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 1; + p->MOV(res, flagReg); + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + /* if (x == get_local_id(0) && y == get_local_id(1) && z == get_local_id(2)) in this thread. */ + p->CMP(GEN_CONDITIONAL_NEQ, res, GenRegister::immuw(0)); + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.inversePredicate = 1; + p->MOV(GenRegister::retype(fbl, GEN_TYPE_UW), GenRegister::immuw(0x0)); + p->curr.inversePredicate = 0; + p->MOV(GenRegister::retype(fbl, GEN_TYPE_UW), GenRegister::immuw(0xffff)); + + p->curr.predicate = GEN_PREDICATE_NONE; + p->MOV(flagReg, GenRegister::retype(fbl, GEN_TYPE_UW)); + + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.inversePredicate = 1; + p->IF(GenRegister::immuw(6)); + p->WAIT(2); + p->curr.predicate = GEN_PREDICATE_NONE; + p->ENDIF(GenRegister::immuw(2)); + } p->pop(); + + + p->push(); { /* Fill all the workitems in the same thread with the value. */ + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + p->curr.noMask = 1; + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->curr.execWidth = 1; + p->MOV(GenRegister::addr1(0), GenRegister::immuw(theVal.nr*32 + theVal.subnr)); + if (theVal.hstride != GEN_HORIZONTAL_STRIDE_0) { + p->FBL(fbl, res); + p->MUL(fbl, fbl, GenRegister::immud(typeSize(theVal.type))); + p->ADD(GenRegister::addr1(0), GenRegister::addr1(0), GenRegister::retype(fbl, GEN_TYPE_UW)); + } + + GenRegister v = GenRegister::to_indirectNx1(theVal, theVal.nr*32 + theVal.subnr, 0); + v.vstride = GEN_VERTICAL_STRIDE_0; + v.width = GEN_WIDTH_1; + v.hstride = GEN_HORIZONTAL_STRIDE_0; + if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) { + p->push(); + p->curr.execWidth = 1; + p->MOV(dst, v); + p->pop(); + } else { + if (simd == 16) { + p->push(); + p->curr.execWidth = 8; + p->MOV(dst, v); + p->curr.quarterControl = GEN_COMPRESSION_Q2; + p->MOV(GenRegister::Qn(dst, 1), v); + p->pop(); + } else { + p->push(); + p->curr.execWidth = 8; + p->MOV(dst, v); + p->pop(); + } + } + + GenRegister data = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 0), GEN_TYPE_UD); + data.vstride = GEN_VERTICAL_STRIDE_0; + data.width = GEN_WIDTH_1; + data.hstride = GEN_HORIZONTAL_STRIDE_0; + p->MOV(data, v); + } p->pop(); + + p->push(); { /* Fill all the workitems with the value from forward MSG. */ + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + p->curr.inversePredicate = 1; + p->curr.noMask = 1; + p->curr.predicate = GEN_PREDICATE_NORMAL; + GenRegister data = GenRegister::retype(nextThreadID, dst.type); + data.vstride = GEN_VERTICAL_STRIDE_0; + data.width = GEN_WIDTH_1; + data.hstride = GEN_HORIZONTAL_STRIDE_0; + p->MOV(dst, data); + } p->pop(); + + p->push(); {/* Then we forward the value to the other threads. */ + p->curr.noMask = 1; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.execWidth = 1; + GenRegister offLen = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 20), GEN_TYPE_UD); + offLen.vstride = GEN_VERTICAL_STRIDE_0; + offLen.width = GEN_WIDTH_1; + offLen.hstride = GEN_HORIZONTAL_STRIDE_0; + uint32_t szEnc = typeSize(theVal.type) >> 1; + if (szEnc == 4) { + szEnc = 3; + } + p->MOV(offLen, GenRegister::immud((szEnc << 8) | (nextThreadID.nr << 21))); + + GenRegister tidEuid = GenRegister::retype(GenRegister::offset(nextThreadID, 0, 16), GEN_TYPE_UD); + tidEuid.vstride = GEN_VERTICAL_STRIDE_0; + tidEuid.width = GEN_WIDTH_1; + tidEuid.hstride = GEN_HORIZONTAL_STRIDE_0; + p->SHL(tidEuid, tidEuid, GenRegister::immud(16)); + + p->curr.execWidth = 8; + p->FWD_GATEWAY_MSG(nextThreadID, 2); + } p->pop(); + + p->push(); { + /* If we are first thread, wait last one to notify us. */ + p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->IF(GenRegister::immuw(6)); + p->WAIT(2); + p->curr.predicate = GEN_PREDICATE_NONE; + p->ENDIF(GenRegister::immuw(2)); + } p->pop(); + } +#endif + void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) { GenRegister src = ra->genReg(insn.src(0)); if(sel->isScalarReg(src.reg())) diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 6ca88db..95d336e 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -167,6 +167,7 @@ namespace gbe void emitGetImageInfoInstruction(const SelectionInstruction &insn); virtual void emitI64MULInstruction(const SelectionInstruction &insn); virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn); + void emitWorkGroupOpInstruction(const SelectionInstruction &insn); void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode); void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode); -- 1.7.9.5 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
