[Beignet] [PATCH 4/4] GBE: use soft mask to handle the barrier call.

Zhigang Gong Mon, 30 Dec 2013 03:44:26 -0800

As the GPU is running under predication control, the following IR
may lead one single barrier be called twice at runtime.


A:
  barrier()
  instructions after barrier()

B:
  ...
  BR(cond) A

C:
  ...
  BR A

When it runs to B's BR instruction, and if any of the condition bits is
true, it will jump to block A to execute the barrier. Then latter, if
any of the condition bits is false, it will continue to execute the
block C's code and at the end of the C block, it jump to A to execute
the barrier again.

If on the other thread, all the condition bits are true, then it triggers
a hang.

And even if all the threads run the same count of barrier, it may cause
incorrect result, as it executes the instructions after barrier() in block
A before all the work items hit the barrier point.

The solution to fix this issue is to use a soft mask register. The register
is shared by all barrier call. We initialize it to !emask at the beginning
of the program.

barrierMask = !emask.

Then when it runs into the barrier call, we set current predication bits
to the mask register, and check whether all the lanes are set. If any of
the lanes is disabled, we simply jump to next basic block. Then latter
when it runs into barrier again, we can set more bits/lanes to 1, and
check it again, if all the bits are 1, then we set the preciation flag 0,0
to all 1 and execute the barrier call and after the wait, we reinitialize
the barrierMask to !emask, and run all the other instructions after the
barrier() in block A with all lanes enabled.

After this patch, we can fix the hang issue when testing the opencv's
transpose test cases.

Signed-off-by: Zhigang Gong <[email protected]>
---
 backend/src/backend/context.cpp            |    5 ++-
 backend/src/backend/gen_context.cpp        |   47 ++++++++++++++++++++++++++++
 backend/src/backend/gen_insn_selection.cpp |   38 +++++++++++-----------
 backend/src/backend/gen_insn_selection.hpp |    1 +
 backend/src/backend/gen_reg_allocation.cpp |    7 +++--
 backend/src/backend/program.h              |    1 +
 backend/src/ir/profile.cpp                 |    4 ++-
 backend/src/ir/profile.hpp                 |    5 +--
 8 files changed, 82 insertions(+), 26 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 88375ad..d389d01 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -432,6 +432,7 @@ namespace gbe
     this->insertCurbeReg(ir::ocl::blockip, 
this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
     this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 
0,  sizeof(uint32_t)));
     this->insertCurbeReg(ir::ocl::notemask, 
this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
+    this->insertCurbeReg(ir::ocl::barriermask, 
this->newCurbeEntry(GBE_CURBE_BARRIER_MASK, 0, sizeof(uint32_t)));
 
     // Go over the arguments and find the related patch locations
     const uint32_t argNum = fn.argNum();
@@ -687,7 +688,9 @@ namespace gbe
         reg == ir::ocl::goffset2  ||
         reg == ir::ocl::workdim   ||
         reg == ir::ocl::emask     ||
-        reg == ir::ocl::notemask)
+        reg == ir::ocl::notemask  ||
+        reg == ir::ocl::barriermask
+      )
       return true;
     return false;
   }
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 97ae2b3..d906bf1 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -100,6 +100,7 @@ namespace gbe
     /* clear all the bit in f0.0. */
     p->curr.execWidth = 1;
     p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), 
GenRegister::immud(0x0000));
+    /* clear the barrier mask bits to all zero0*/
     p->curr.noMask = 0;
     p->curr.useFlag(0, 0);
     p->curr.execWidth = execWidth;
@@ -109,6 +110,7 @@ namespace gbe
     p->curr.execWidth = 1;
     p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), 
GEN_TYPE_UW));
     p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF));
+    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
     p->pop();
   }
 
@@ -1391,7 +1393,52 @@ namespace gbe
 
   void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister fenceDst = ra->genReg(insn.dst(0));
+    uint32_t barrierType = insn.extra.barrierType;
+    const GenRegister barrierId = 
ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+
+    uint32_t exeWidth = p->curr.execWidth;
+    p->push();
+    uint32_t label = insn.parent->bb->getNextBlock()->getLabelIndex();
+    this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
+    if (exeWidth == 16)
+      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+    else if (exeWidth == 8)
+      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+    else
+      NOT_IMPLEMENTED;
+    p->curr.inversePredicate = 1;
+    p->curr.execWidth = 1;
+    p->curr.noMask = 1;
+    // If not all channel is set to 1, the barrier is still waiting for other 
lanes to complete,
+    // jump to next basic block.
+    p->JMPI(GenRegister::immud(0));
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->MOV(GenRegister::flag(0, 0), 
ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
+    p->pop();
+
+    p->push();
+    p->curr.useFlag(0, 0);
+    if (barrierType == ir::syncGlobalBarrier) {
+      p->FENCE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    // As only the payload.2 is used and all the other regions are ignored
+    // SIMD8 mode here is safe.
+    p->curr.execWidth = 8;
+    p->curr.physicalFlag = 0;
+    p->curr.noMask = 1;
+    // Copy barrier id from r0.
+    p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+    // A barrier is OK to start the thread synchronization *and* SLM fence
     p->BARRIER(src);
+    // Now we wait for the other threads
+    p->curr.execWidth = 1;
+    p->WAIT();
+    // we executed the barrier then restore the barrier soft mask to initial 
value.
+    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), 
ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
+    p->pop();
   }
 
   void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index ecff0f2..c9048e5 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -493,7 +493,7 @@ namespace gbe
     /*! Saturated subtraction of 64-bit integer */
     void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
     /*! Encode a barrier instruction */
-    void BARRIER(GenRegister src);
+    void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
     /*! Encode a barrier instruction */
     void FENCE(GenRegister dst);
     /*! Encode a label instruction */
@@ -818,9 +818,11 @@ namespace gbe
     insn->index = uint16_t(index);
   }
 
-  void Selection::Opaque::BARRIER(GenRegister src) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 0, 1);
+  void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t 
barrierType) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1);
     insn->src(0) = src;
+    insn->dst(0) = fence;
+    insn->extra.barrierType = barrierType;
   }
 
   void Selection::Opaque::FENCE(GenRegister dst) {
@@ -2235,29 +2237,25 @@ namespace gbe
     {
       using namespace ir;
       const ir::Register reg = sel.reg(FAMILY_DWORD);
-
+      const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
+      const GenRegister tempFlag = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
+      const GenRegister flagReg = GenRegister::flag(0, 0);
       const uint32_t params = insn.getParameters();
-      if(params == syncGlobalBarrier) {
-        const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
-        sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
-      }
 
       sel.push();
         sel.curr.predicate = GEN_PREDICATE_NONE;
-
-        // As only the payload.2 is used and all the other regions are ignored
-        // SIMD8 mode here is safe.
-        sel.curr.execWidth = 8;
-        sel.curr.physicalFlag = 0;
         sel.curr.noMask = 1;
-        // Copy barrier id from r0.
-        sel.AND(GenRegister::ud8grf(reg), 
GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
-
-        // A barrier is OK to start the thread synchronization *and* SLM fence
-        sel.BARRIER(GenRegister::f8grf(reg));
-        // Now we wait for the other threads
         sel.curr.execWidth = 1;
-        sel.WAIT();
+        sel.OR(barrierMask, flagReg, barrierMask);
+        sel.MOV(tempFlag, barrierMask);
+      sel.pop();
+
+      // A barrier is OK to start the thread synchronization *and* SLM fence
+      sel.push();
+      //sel.curr.predicate = GEN_PREDICATE_NONE;
+      sel.curr.flagIndex = (uint16_t)tempFlag.value.reg;
+      sel.curr.physicalFlag = 0;
+      sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), 
params);
       sel.pop();
       return true;
     }
diff --git a/backend/src/backend/gen_insn_selection.hpp 
b/backend/src/backend/gen_insn_selection.hpp
index 2422b2b..7566928 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -111,6 +111,7 @@ namespace gbe
         uint16_t scratchOffset;
         uint16_t scratchMsgHeader;
       };
+      uint32_t barrierType;
     } extra;
     /*! Gen opcode */
     uint8_t opcode;
diff --git a/backend/src/backend/gen_reg_allocation.cpp 
b/backend/src/backend/gen_reg_allocation.cpp
index 02fc534..2e2be04 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -385,9 +385,10 @@ namespace gbe
             grfBooleans.insert(spill.reg);
             spill = interval;
           }
-          // We will a grf for the current register
-          else
+          // We will use a grf for the current register
+          else {
             grfBooleans.insert(reg);
+          }
         }
         else
           allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
@@ -638,6 +639,8 @@ namespace gbe
     this->intervals[ocl::emask].maxID = INT_MAX;
     this->intervals[ocl::notemask].minID = 0;
     this->intervals[ocl::notemask].maxID = INT_MAX;
+//    this->intervals[ocl::barriermask].minID = 0;
+//    this->intervals[ocl::barriermask].maxID = INT_MAX;
     this->intervals[ocl::retVal].minID = INT_MAX;
     this->intervals[ocl::retVal].maxID = -INT_MAX;
 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 9a3570e..4705578 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -79,6 +79,7 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_EMASK,
   GBE_CURBE_NOT_EMASK,
+  GBE_CURBE_BARRIER_MASK,
 };
 
 /*! Extra arguments use the negative range of sub-values */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 9c3f333..ef3ea28 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -40,7 +40,8 @@ namespace ir {
         "stack_pointer",
         "block_ip",
         "barrier_id", "thread_number",
-        "work_dimension", "sampler_info", "emask", "notemask", "retVal"
+        "work_dimension", "sampler_info",
+        "emask", "notemask", "barriermask", "retVal"
     };
 
 #if GBE_DEBUG
@@ -79,6 +80,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_WORD, samplerinfo);
       DECL_NEW_REG(FAMILY_WORD, emask);
       DECL_NEW_REG(FAMILY_WORD, notemask);
+      DECL_NEW_REG(FAMILY_WORD, barriermask);
       DECL_NEW_REG(FAMILY_WORD, retVal);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 90c504f..1e11fbf 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -67,8 +67,9 @@ namespace ir {
     static const Register samplerinfo = Register(23); // store sampler info.
     static const Register emask = Register(24);    // store the emask bits for 
the branching fix.
     static const Register notemask = Register(25); // store the !emask bits 
for the branching fix.
-    static const Register retVal = Register(26);   // helper register to do 
data flow analysis.
-    static const uint32_t regNum = 27;             // number of special 
registers
+    static const Register barriermask = Register(26); // store the !emask bits 
for the branching fix.
+    static const Register retVal = Register(27);   // helper register to do 
data flow analysis.
+    static const uint32_t regNum = 28;             // number of special 
registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
-- 
1.7.9.5

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet

[Beignet] [PATCH 4/4] GBE: use soft mask to handle the barrier call.

Reply via email to