When the work-group size is not aligned to simdWidth, predication with any8h/any16h will calculate pmask using flag-register bits that are mapped to non-active lanes as well. As the flag register is not cleared by default, any8h/any16h used for a jmpi instruction may cause a wrong jump, and possibly an infinite loop.
So, we clear Flag register to 0 to make any8/16h prediction work correct. Signed-off-by: Ruiling Song <[email protected]> --- backend/src/backend/gen_context.cpp | 13 +++++++++++++ backend/src/backend/gen_context.hpp | 1 + backend/src/backend/gen_insn_selection.cpp | 3 +++ 3 files changed, 17 insertions(+) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 6eeab51..a029719 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -88,6 +88,18 @@ namespace gbe } } + void GenContext::clearFlagRegister(void) { + // when group size not aligned to simdWidth, flag register need clear to + // make prediction(any8/16h) work correctly + p->push(); + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 1; + p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0)); + p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0)); + p->pop(); + } + void GenContext::emitStackPointer(void) { using namespace ir; @@ -1091,6 +1103,7 @@ namespace gbe schedulePostRegAllocation(*this, *this->sel); if (OCL_OUTPUT_REG_ALLOC) ra->outputAllocation(); + this->clearFlagRegister(); this->emitStackPointer(); this->emitInstructionStream(); this->patchBranches(); diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 8b481d0..f66ec95 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -61,6 +61,7 @@ namespace gbe INLINE const ir::Function &getFunction(void) const { return fn; } /*! Simd width chosen for the current function */ INLINE uint32_t getSimdWidth(void) const { return simdWidth; } + void clearFlagRegister(void); /*! Emit the per-lane stack pointer computation */ void emitStackPointer(void); /*! 
Emit the instructions */ diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 55db48e..bca08ba 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -2621,6 +2621,9 @@ namespace gbe sel.CMP(GEN_CONDITIONAL_G, ip, GenRegister::immuw(nextLabel)); // Branch to the jump target + // XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not + // work correct, as flag register bits mapped to non-active lanes tend + // to be zero. if (simdWidth == 8) sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; else if (simdWidth == 16) -- 1.7.9.5 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
