[Beignet] [PATCH 3/4] GBE: fix the potential issue when there are inactive lanes.

Zhigang Gong Mon, 30 Dec 2013 03:44:35 -0800

If there are some inactive lanes, then the JMPI with all16h
may fail to jum even all the active lanes are in false condition.


Then it may execute a BB with all zero flag, and when the BB
has some noMask/noPredication instructions, it may bring unexpected
result. this patch fixes this problem by the following method.
It use two UW register to fixup the flag result before each
JMP. Before the ALL16/8H JMPI, it set the inactive lane to 1s.
Before the ANY16/8 JMPI, it clear all the inactive lane to 0s.

It introduces one extra instruction before each predicatable JMPI
instruction. It causes a little bit overhead.

Signed-off-by: Zhigang Gong <[email protected]>
---
 backend/src/backend/context.cpp            |    9 ++++----
 backend/src/backend/gen_context.cpp        |   16 ++++++++++++--
 backend/src/backend/gen_insn_selection.cpp |   33 +++++++++++++++++++++++-----
 backend/src/backend/gen_reg_allocation.cpp |    7 ++++++
 backend/src/backend/program.h              |    4 +++-
 backend/src/ir/function.cpp                |    2 +-
 backend/src/ir/profile.cpp                 |    4 +++-
 backend/src/ir/profile.hpp                 |    6 +++--
 8 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 1bbe700..88375ad 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -430,6 +430,8 @@ namespace gbe
 
     // We insert the block IP mask first
     this->insertCurbeReg(ir::ocl::blockip, 
this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
+    this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 
0,  sizeof(uint32_t)));
+    this->insertCurbeReg(ir::ocl::notemask, 
this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
 
     // Go over the arguments and find the related patch locations
     const uint32_t argNum = fn.argNum();
@@ -512,9 +514,6 @@ namespace gbe
     });
 #undef INSERT_REG
 
-    // Insert the number of threads
-    insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 
0, sizeof(uint32_t)));
-
     // Insert the stack buffer if used
     if (useStackPtr)
       insertCurbeReg(ir::ocl::stackptr, 
this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
@@ -686,7 +685,9 @@ namespace gbe
         reg == ir::ocl::goffset0  ||
         reg == ir::ocl::goffset1  ||
         reg == ir::ocl::goffset2  ||
-        reg == ir::ocl::workdim)
+        reg == ir::ocl::workdim   ||
+        reg == ir::ocl::emask     ||
+        reg == ir::ocl::notemask)
       return true;
     return false;
   }
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 6fd2dea..97ae2b3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -91,12 +91,24 @@ namespace gbe
   void GenContext::clearFlagRegister(void) {
     // when group size not aligned to simdWidth, flag register need clear to
     // make prediction(any8/16h) work correctly
+    const GenRegister emaskReg = 
ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
+    const GenRegister notEmaskReg = 
ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
+    uint32_t execWidth = p->curr.execWidth;
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
     p->curr.noMask = 1;
+    /* clear all the bit in f0.0. */
     p->curr.execWidth = 1;
-    p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), 
GenRegister::immud(0x0));
-    p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), 
GenRegister::immud(0x0));
+    p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), 
GenRegister::immud(0x0000));
+    p->curr.noMask = 0;
+    p->curr.useFlag(0, 0);
+    p->curr.execWidth = execWidth;
+    /* set all the active lane to 1. Inactive lane remains 0. */
+    p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), 
GenRegister::ud16grf(126, 0));
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), 
GEN_TYPE_UW));
+    p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF));
     p->pop();
   }
 
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 23e9da7..ecff0f2 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2882,6 +2882,14 @@ namespace gbe
       if (sel.ctx.hasJIP(&insn)) {
         const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
         sel.push();
+
+          sel.curr.noMask = 1;
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
+          GenRegister flagReg = GenRegister::flag(0, 0);
+          sel.AND(flagReg, flagReg, emaskReg);
+
           if (simdWidth == 8)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
           else if (simdWidth == 16)
@@ -2889,10 +2897,8 @@ namespace gbe
           else
             NOT_IMPLEMENTED;
           sel.curr.inversePredicate = 1;
-          sel.curr.execWidth = 1;
           sel.curr.flag = 0;
           sel.curr.subFlag = 0;
-          sel.curr.noMask = 1;
           sel.JMPI(GenRegister::immd(0), jip);
         sel.pop();
       }
@@ -3032,6 +3038,9 @@ namespace gbe
         // It is slightly more complicated than for backward jump. We check 
that
         // all PcIPs are greater than the next block IP to be sure that we can
         // jump
+        // We set all the inactive channel to 1 as the 
GEN_PREDICATE_ALIGN1_ALL8/16
+        // will check those bits as well.
+
         sel.push();
           sel.curr.physicalFlag = 0;
           sel.curr.flagIndex = uint16_t(pred);
@@ -3042,14 +3051,19 @@ namespace gbe
           // XXX TODO: For group size not aligned to simdWidth, ALL8/16h may 
not
           // work correct, as flag register bits mapped to non-active lanes 
tend
           // to be zero.
+
+          sel.curr.execWidth = 1;
+          sel.curr.noMask = 1;
+          GenRegister notEmaskReg = GenRegister::uw1grf(ocl::notemask);
+          sel.OR(sel.selReg(pred, TYPE_U16), sel.selReg(pred, TYPE_U16), 
notEmaskReg);
+
           if (simdWidth == 8)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
           else if (simdWidth == 16)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
           else
             NOT_SUPPORTED;
-          sel.curr.execWidth = 1;
-          sel.curr.noMask = 1;
+
           sel.JMPI(GenRegister::immd(0), jip);
         sel.pop();
 
@@ -3084,6 +3098,7 @@ namespace gbe
       if (insn.isPredicated() == true) {
         const Register pred = insn.getPredicateIndex();
 
+
         // Update the PcIPs for all the branches. Just put the IPs of the next
         // block. Next instruction will properly reupdate the IPs of the lanes
         // that actually take the branch
@@ -3096,6 +3111,14 @@ namespace gbe
           sel.curr.flagIndex = uint16_t(pred);
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
 
+        // We clear all the inactive channel to 0 as the 
GEN_PREDICATE_ALIGN1_ALL8/16
+        // will check those bits as well.
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.execWidth = 1;
+          sel.curr.noMask = 1;
+          GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
+          sel.AND(sel.selReg(pred, TYPE_U16), sel.selReg(pred, TYPE_U16), 
emaskReg);
+
           // Branch to the jump target
           if (simdWidth == 8)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
@@ -3103,8 +3126,6 @@ namespace gbe
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
           else
             NOT_SUPPORTED;
-          sel.curr.execWidth = 1;
-          sel.curr.noMask = 1;
           sel.JMPI(GenRegister::immd(0), jip);
         sel.pop();
 
diff --git a/backend/src/backend/gen_reg_allocation.cpp 
b/backend/src/backend/gen_reg_allocation.cpp
index 2bb0d19..02fc534 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -634,6 +634,13 @@ namespace gbe
         this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, 
lastID);
     }
 
+    this->intervals[ocl::emask].minID = 0;
+    this->intervals[ocl::emask].maxID = INT_MAX;
+    this->intervals[ocl::notemask].minID = 0;
+    this->intervals[ocl::notemask].maxID = INT_MAX;
+    this->intervals[ocl::retVal].minID = INT_MAX;
+    this->intervals[ocl::retVal].maxID = -INT_MAX;
+
     // Sort both intervals in starting point and ending point increasing orders
     const uint32_t regNum = ctx.sel->getRegNum();
     this->starting.resize(regNum);
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index e574764..9a3570e 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -76,7 +76,9 @@ enum gbe_curbe_type {
   GBE_CURBE_KERNEL_ARGUMENT,
   GBE_CURBE_EXTRA_ARGUMENT,
   GBE_CURBE_BLOCK_IP,
-  GBE_CURBE_THREAD_NUM
+  GBE_CURBE_THREAD_NUM,
+  GBE_CURBE_EMASK,
+  GBE_CURBE_NOT_EMASK,
 };
 
 /*! Extra arguments use the negative range of sub-values */
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index c15c292..4273f66 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -227,7 +227,7 @@ namespace ir {
         GBE_ASSERT(target != NULL);
         target->predecessors.insert(&bb);
         bb.successors.insert(target);
-        if (insn.isPredicated() == true) jumpToNext = &bb;
+        if ( insn.isPredicated() == true) jumpToNext = &bb;
       }
     });
   }
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index b693c2b..9c3f333 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -40,7 +40,7 @@ namespace ir {
         "stack_pointer",
         "block_ip",
         "barrier_id", "thread_number",
-        "work_dimension", "sampler_info", "retVal"
+        "work_dimension", "sampler_info", "emask", "notemask", "retVal"
     };
 
 #if GBE_DEBUG
@@ -77,6 +77,8 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, threadn);
       DECL_NEW_REG(FAMILY_DWORD, workdim);
       DECL_NEW_REG(FAMILY_WORD, samplerinfo);
+      DECL_NEW_REG(FAMILY_WORD, emask);
+      DECL_NEW_REG(FAMILY_WORD, notemask);
       DECL_NEW_REG(FAMILY_WORD, retVal);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 2f16741..90c504f 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -65,8 +65,10 @@ namespace ir {
     static const Register threadn = Register(21);  // number of threads
     static const Register workdim = Register(22);  // work dimention.
     static const Register samplerinfo = Register(23); // store sampler info.
-    static const Register retVal = Register(24);   // helper register to do 
data flow analysis.
-    static const uint32_t regNum = 25;             // number of special 
registers
+    static const Register emask = Register(24);    // store the emask bits for 
the branching fix.
+    static const Register notemask = Register(25); // store the !emask bits 
for the branching fix.
+    static const Register retVal = Register(26);   // helper register to do 
data flow analysis.
+    static const uint32_t regNum = 27;             // number of special 
registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
-- 
1.7.9.5

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet

[Beignet] [PATCH 3/4] GBE: fix the potential issue when there are inactive lanes.

Reply via email to