As the GPU is running under predication control, the following IR may lead one single barrier be called twice at runtime.
A: barrier() instructions after barrier() B: ... BR(cond) A C: ... BR A When it runs to B's BR instruction, and if any of the condition bits is true, it will jump to block A to execute the barrier. Then latter, if any of the condition bits is false, it will continue to execute the block C's code and at the end of the C block, it jump to A to execute the barrier again. If on the other thread, all the condition bits are true, then it triggers a hang. And even if all the threads run the same count of barrier, it may cause incorrect result, as it executes the instructions after barrier() in block A before all the work items hit the barrier point. The solution to fix this issue is to use a soft mask register. The register is shared by all barrier call. We initialize it to !emask at the beginning of the program. barrierMask = !emask. Then when it runs into the barrier call, we set current predication bits to the mask register, and check whether all the lanes are set. If any of the lanes is disabled, we simply jump to next basic block. Then latter when it runs into barrier again, we can set more bits/lanes to 1, and check it again, if all the bits are 1, then we set the preciation flag 0,0 to all 1 and execute the barrier call and after the wait, we reinitialize the barrierMask to !emask, and run all the other instructions after the barrier() in block A with all lanes enabled. After this patch, we can fix the hang issue when testing the opencv's transpose test cases. Signed-off-by: Zhigang Gong <[email protected]> --- backend/src/backend/context.cpp | 5 ++- backend/src/backend/gen_context.cpp | 47 ++++++++++++++++++++++++++++ backend/src/backend/gen_insn_selection.cpp | 38 +++++++++++----------- backend/src/backend/gen_insn_selection.hpp | 1 + backend/src/backend/gen_reg_allocation.cpp | 7 +++-- backend/src/backend/program.h | 1 + backend/src/ir/profile.cpp | 4 ++- backend/src/ir/profile.hpp | 5 +-- 8 files changed, 82 insertions(+), 26 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 88375ad..d389d01 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -432,6 +432,7 @@ namespace gbe this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t))); this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0, sizeof(uint32_t))); this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t))); + this->insertCurbeReg(ir::ocl::barriermask, this->newCurbeEntry(GBE_CURBE_BARRIER_MASK, 0, sizeof(uint32_t))); // Go over the arguments and find the related patch locations const uint32_t argNum = fn.argNum(); @@ -687,7 +688,9 @@ namespace gbe reg == ir::ocl::goffset2 || reg == ir::ocl::workdim || reg == ir::ocl::emask || - reg == ir::ocl::notemask) + reg == ir::ocl::notemask || + reg == ir::ocl::barriermask + ) return true; return false; } diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 97ae2b3..d906bf1 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -100,6 +100,7 @@ namespace gbe /* clear all the bit in f0.0. */ p->curr.execWidth = 1; p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immud(0x0000)); + /* clear the barrier mask bits to all zero0*/ p->curr.noMask = 0; p->curr.useFlag(0, 0); p->curr.execWidth = execWidth; @@ -109,6 +110,7 @@ namespace gbe p->curr.execWidth = 1; p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW)); p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF)); + p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg); p->pop(); } @@ -1391,7 +1393,52 @@ namespace gbe void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) { const GenRegister src = ra->genReg(insn.src(0)); + const GenRegister fenceDst = ra->genReg(insn.dst(0)); + uint32_t barrierType = insn.extra.barrierType; + const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid)); + + uint32_t exeWidth = p->curr.execWidth; + p->push(); + uint32_t label = insn.parent->bb->getNextBlock()->getLabelIndex(); + this->branchPos2.push_back(std::make_pair(label, p->n_instruction())); + if (exeWidth == 16) + p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; + else if (exeWidth == 8) + p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; + else + NOT_IMPLEMENTED; + p->curr.inversePredicate = 1; + p->curr.execWidth = 1; + p->curr.noMask = 1; + // If not all channel is set to 1, the barrier is still waiting for other lanes to complete, + // jump to next basic block. + p->JMPI(GenRegister::immud(0)); + p->curr.predicate = GEN_PREDICATE_NONE; + p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask))); + p->pop(); + + p->push(); + p->curr.useFlag(0, 0); + if (barrierType == ir::syncGlobalBarrier) { + p->FENCE(fenceDst); + p->MOV(fenceDst, fenceDst); + } + p->curr.predicate = GEN_PREDICATE_NONE; + // As only the payload.2 is used and all the other regions are ignored + // SIMD8 mode here is safe. + p->curr.execWidth = 8; + p->curr.physicalFlag = 0; + p->curr.noMask = 1; + // Copy barrier id from r0. + p->AND(src, barrierId, GenRegister::immud(0x0f000000)); + // A barrier is OK to start the thread synchronization *and* SLM fence p->BARRIER(src); + // Now we wait for the other threads + p->curr.execWidth = 1; + p->WAIT(); + // we executed the barrier then restore the barrier soft mask to initial value. + p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask))); + p->pop(); } void GenContext::emitFenceInstruction(const SelectionInstruction &insn) { diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index ecff0f2..c9048e5 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -493,7 +493,7 @@ namespace gbe /*! Saturated subtraction of 64-bit integer */ void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]); /*! Encode a barrier instruction */ - void BARRIER(GenRegister src); + void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType); /*! Encode a barrier instruction */ void FENCE(GenRegister dst); /*! Encode a label instruction */ @@ -818,9 +818,11 @@ namespace gbe insn->index = uint16_t(index); } - void Selection::Opaque::BARRIER(GenRegister src) { - SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 0, 1); + void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1); insn->src(0) = src; + insn->dst(0) = fence; + insn->extra.barrierType = barrierType; } void Selection::Opaque::FENCE(GenRegister dst) { @@ -2235,29 +2237,25 @@ namespace gbe { using namespace ir; const ir::Register reg = sel.reg(FAMILY_DWORD); - + const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL); + const GenRegister tempFlag = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL); + const GenRegister flagReg = GenRegister::flag(0, 0); const uint32_t params = insn.getParameters(); - if(params == syncGlobalBarrier) { - const ir::Register fenceDst = sel.reg(FAMILY_DWORD); - sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32)); - } sel.push(); sel.curr.predicate = GEN_PREDICATE_NONE; - - // As only the payload.2 is used and all the other regions are ignored - // SIMD8 mode here is safe. - sel.curr.execWidth = 8; - sel.curr.physicalFlag = 0; sel.curr.noMask = 1; - // Copy barrier id from r0. - sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000)); - - // A barrier is OK to start the thread synchronization *and* SLM fence - sel.BARRIER(GenRegister::f8grf(reg)); - // Now we wait for the other threads sel.curr.execWidth = 1; - sel.WAIT(); + sel.OR(barrierMask, flagReg, barrierMask); + sel.MOV(tempFlag, barrierMask); + sel.pop(); + + // A barrier is OK to start the thread synchronization *and* SLM fence + sel.push(); + //sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.flagIndex = (uint16_t)tempFlag.value.reg; + sel.curr.physicalFlag = 0; + sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params); sel.pop(); return true; } diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp index 2422b2b..7566928 100644 --- a/backend/src/backend/gen_insn_selection.hpp +++ b/backend/src/backend/gen_insn_selection.hpp @@ -111,6 +111,7 @@ namespace gbe uint16_t scratchOffset; uint16_t scratchMsgHeader; }; + uint32_t barrierType; } extra; /*! Gen opcode */ uint8_t opcode; diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 02fc534..2e2be04 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -385,9 +385,10 @@ namespace gbe grfBooleans.insert(spill.reg); spill = interval; } - // We will a grf for the current register - else + // We will use a grf for the current register + else { grfBooleans.insert(reg); + } } else allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum])); @@ -638,6 +639,8 @@ namespace gbe this->intervals[ocl::emask].maxID = INT_MAX; this->intervals[ocl::notemask].minID = 0; this->intervals[ocl::notemask].maxID = INT_MAX; +// this->intervals[ocl::barriermask].minID = 0; +// this->intervals[ocl::barriermask].maxID = INT_MAX; this->intervals[ocl::retVal].minID = INT_MAX; this->intervals[ocl::retVal].maxID = -INT_MAX; diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index 9a3570e..4705578 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -79,6 +79,7 @@ enum gbe_curbe_type { GBE_CURBE_THREAD_NUM, GBE_CURBE_EMASK, GBE_CURBE_NOT_EMASK, + GBE_CURBE_BARRIER_MASK, }; /*! Extra arguments use the negative range of sub-values */ diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index 9c3f333..ef3ea28 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -40,7 +40,8 @@ namespace ir { "stack_pointer", "block_ip", "barrier_id", "thread_number", - "work_dimension", "sampler_info", "emask", "notemask", "retVal" + "work_dimension", "sampler_info", + "emask", "notemask", "barriermask", "retVal" }; #if GBE_DEBUG @@ -79,6 +80,7 @@ namespace ir { DECL_NEW_REG(FAMILY_WORD, samplerinfo); DECL_NEW_REG(FAMILY_WORD, emask); DECL_NEW_REG(FAMILY_WORD, notemask); + DECL_NEW_REG(FAMILY_WORD, barriermask); DECL_NEW_REG(FAMILY_WORD, retVal); } #undef DECL_NEW_REG diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index 90c504f..1e11fbf 100644 --- a/backend/src/ir/profile.hpp +++ b/backend/src/ir/profile.hpp @@ -67,8 +67,9 @@ namespace ir { static const Register samplerinfo = Register(23); // store sampler info. static const Register emask = Register(24); // store the emask bits for the branching fix. static const Register notemask = Register(25); // store the !emask bits for the branching fix. - static const Register retVal = Register(26); // helper register to do data flow analysis. - static const uint32_t regNum = 27; // number of special registers + static const Register barriermask = Register(26); // store the !emask bits for the branching fix. + static const Register retVal = Register(27); // helper register to do data flow analysis. + static const uint32_t regNum = 28; // number of special registers extern const char *specialRegMean[]; // special register name. } /* namespace ocl */ -- 1.7.9.5 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
