If there are some inactive lanes, then the JMPI with all16h may fail to jum even all the active lanes are in false condition.
Then it may execute a BB with all zero flag, and when the BB has some noMask/noPredication instructions, it may bring unexpected result. this patch fixes this problem by the following method. It use two UW register to fixup the flag result before each JMP. Before the ALL16/8H JMPI, it set the inactive lane to 1s. Before the ANY16/8 JMPI, it clear all the inactive lane to 0s. It introduces one extra instruction before each predicatable JMPI instruction. It causes a little bit overhead. Signed-off-by: Zhigang Gong <[email protected]> --- backend/src/backend/context.cpp | 9 ++++---- backend/src/backend/gen_context.cpp | 16 ++++++++++++-- backend/src/backend/gen_insn_selection.cpp | 33 +++++++++++++++++++++++----- backend/src/backend/gen_reg_allocation.cpp | 7 ++++++ backend/src/backend/program.h | 4 +++- backend/src/ir/function.cpp | 2 +- backend/src/ir/profile.cpp | 4 +++- backend/src/ir/profile.hpp | 6 +++-- 8 files changed, 64 insertions(+), 17 deletions(-) diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp index 1bbe700..88375ad 100644 --- a/backend/src/backend/context.cpp +++ b/backend/src/backend/context.cpp @@ -430,6 +430,8 @@ namespace gbe // We insert the block IP mask first this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t))); + this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0, sizeof(uint32_t))); + this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t))); // Go over the arguments and find the related patch locations const uint32_t argNum = fn.argNum(); @@ -512,9 +514,6 @@ namespace gbe }); #undef INSERT_REG - // Insert the number of threads - insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t))); - // Insert the stack buffer if used if (useStackPtr) insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize)); @@ -686,7 +685,9 @@ namespace gbe reg == ir::ocl::goffset0 || reg == ir::ocl::goffset1 || reg == ir::ocl::goffset2 || - reg == ir::ocl::workdim) + reg == ir::ocl::workdim || + reg == ir::ocl::emask || + reg == ir::ocl::notemask) return true; return false; } diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 6fd2dea..97ae2b3 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -91,12 +91,24 @@ namespace gbe void GenContext::clearFlagRegister(void) { // when group size not aligned to simdWidth, flag register need clear to // make prediction(any8/16h) work correctly + const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask)); + const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)); + uint32_t execWidth = p->curr.execWidth; p->push(); p->curr.predicate = GEN_PREDICATE_NONE; p->curr.noMask = 1; + /* clear all the bit in f0.0. */ p->curr.execWidth = 1; - p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0)); - p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0)); + p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immud(0x0000)); + p->curr.noMask = 0; + p->curr.useFlag(0, 0); + p->curr.execWidth = execWidth; + /* set all the active lane to 1. Inactive lane remains 0. */ + p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0)); + p->curr.noMask = 1; + p->curr.execWidth = 1; + p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW)); + p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF)); p->pop(); } diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 23e9da7..ecff0f2 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -2882,6 +2882,14 @@ namespace gbe if (sel.ctx.hasJIP(&insn)) { const LabelIndex jip = sel.ctx.getLabelIndex(&insn); sel.push(); + + sel.curr.noMask = 1; + sel.curr.execWidth = 1; + sel.curr.predicate = GEN_PREDICATE_NONE; + GenRegister emaskReg = GenRegister::uw1grf(ocl::emask); + GenRegister flagReg = GenRegister::flag(0, 0); + sel.AND(flagReg, flagReg, emaskReg); + if (simdWidth == 8) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; else if (simdWidth == 16) @@ -2889,10 +2897,8 @@ namespace gbe else NOT_IMPLEMENTED; sel.curr.inversePredicate = 1; - sel.curr.execWidth = 1; sel.curr.flag = 0; sel.curr.subFlag = 0; - sel.curr.noMask = 1; sel.JMPI(GenRegister::immd(0), jip); sel.pop(); } @@ -3032,6 +3038,9 @@ namespace gbe // It is slightly more complicated than for backward jump. We check that // all PcIPs are greater than the next block IP to be sure that we can // jump + // We set all the inactive channel to 1 as the GEN_PREDICATE_ALIGN1_ALL8/16 + // will check those bits as well. + sel.push(); sel.curr.physicalFlag = 0; sel.curr.flagIndex = uint16_t(pred); @@ -3042,14 +3051,19 @@ namespace gbe // XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not // work correct, as flag register bits mapped to non-active lanes tend // to be zero. + + sel.curr.execWidth = 1; + sel.curr.noMask = 1; + GenRegister notEmaskReg = GenRegister::uw1grf(ocl::notemask); + sel.OR(sel.selReg(pred, TYPE_U16), sel.selReg(pred, TYPE_U16), notEmaskReg); + if (simdWidth == 8) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; else if (simdWidth == 16) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; else NOT_SUPPORTED; - sel.curr.execWidth = 1; - sel.curr.noMask = 1; + sel.JMPI(GenRegister::immd(0), jip); sel.pop(); @@ -3084,6 +3098,7 @@ namespace gbe if (insn.isPredicated() == true) { const Register pred = insn.getPredicateIndex(); + // Update the PcIPs for all the branches. Just put the IPs of the next // block. Next instruction will properly reupdate the IPs of the lanes // that actually take the branch @@ -3096,6 +3111,14 @@ namespace gbe sel.curr.flagIndex = uint16_t(pred); sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); + // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ALL8/16 + // will check those bits as well. + sel.curr.predicate = GEN_PREDICATE_NONE; + sel.curr.execWidth = 1; + sel.curr.noMask = 1; + GenRegister emaskReg = GenRegister::uw1grf(ocl::emask); + sel.AND(sel.selReg(pred, TYPE_U16), sel.selReg(pred, TYPE_U16), emaskReg); + // Branch to the jump target if (simdWidth == 8) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; @@ -3103,8 +3126,6 @@ namespace gbe sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; else NOT_SUPPORTED; - sel.curr.execWidth = 1; - sel.curr.noMask = 1; sel.JMPI(GenRegister::immd(0), jip); sel.pop(); diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 2bb0d19..02fc534 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -634,6 +634,13 @@ namespace gbe this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID); } + this->intervals[ocl::emask].minID = 0; + this->intervals[ocl::emask].maxID = INT_MAX; + this->intervals[ocl::notemask].minID = 0; + this->intervals[ocl::notemask].maxID = INT_MAX; + this->intervals[ocl::retVal].minID = INT_MAX; + this->intervals[ocl::retVal].maxID = -INT_MAX; + // Sort both intervals in starting point and ending point increasing orders const uint32_t regNum = ctx.sel->getRegNum(); this->starting.resize(regNum); diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index e574764..9a3570e 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -76,7 +76,9 @@ enum gbe_curbe_type { GBE_CURBE_KERNEL_ARGUMENT, GBE_CURBE_EXTRA_ARGUMENT, GBE_CURBE_BLOCK_IP, - GBE_CURBE_THREAD_NUM + GBE_CURBE_THREAD_NUM, + GBE_CURBE_EMASK, + GBE_CURBE_NOT_EMASK, }; /*! Extra arguments use the negative range of sub-values */ diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp index c15c292..4273f66 100644 --- a/backend/src/ir/function.cpp +++ b/backend/src/ir/function.cpp @@ -227,7 +227,7 @@ namespace ir { GBE_ASSERT(target != NULL); target->predecessors.insert(&bb); bb.successors.insert(target); - if (insn.isPredicated() == true) jumpToNext = &bb; + if ( insn.isPredicated() == true) jumpToNext = &bb; } }); } diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index b693c2b..9c3f333 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -40,7 +40,7 @@ namespace ir { "stack_pointer", "block_ip", "barrier_id", "thread_number", - "work_dimension", "sampler_info", "retVal" + "work_dimension", "sampler_info", "emask", "notemask", "retVal" }; #if GBE_DEBUG @@ -77,6 +77,8 @@ namespace ir { DECL_NEW_REG(FAMILY_DWORD, threadn); DECL_NEW_REG(FAMILY_DWORD, workdim); DECL_NEW_REG(FAMILY_WORD, samplerinfo); + DECL_NEW_REG(FAMILY_WORD, emask); + DECL_NEW_REG(FAMILY_WORD, notemask); DECL_NEW_REG(FAMILY_WORD, retVal); } #undef DECL_NEW_REG diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index 2f16741..90c504f 100644 --- a/backend/src/ir/profile.hpp +++ b/backend/src/ir/profile.hpp @@ -65,8 +65,10 @@ namespace ir { static const Register threadn = Register(21); // number of threads static const Register workdim = Register(22); // work dimention. static const Register samplerinfo = Register(23); // store sampler info. - static const Register retVal = Register(24); // helper register to do data flow analysis. - static const uint32_t regNum = 25; // number of special registers + static const Register emask = Register(24); // store the emask bits for the branching fix. + static const Register notemask = Register(25); // store the !emask bits for the branching fix. + static const Register retVal = Register(26); // helper register to do data flow analysis. + static const uint32_t regNum = 27; // number of special registers extern const char *specialRegMean[]; // special register name. } /* namespace ocl */ -- 1.7.9.5 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
