From: Junyan He <[email protected]>

We will maintain a real clock to record the real execution time
of the original code. We do not want the added profiling
instructions to introduce overhead into the measurement, so every
time we enter a profiling instructions block, we will calculate the
real time clock value and update the real clock, and when we leave
the profiling instructions block, we will record the timestamp
of that leave point.

Signed-off-by: Junyan He <[email protected]>
---
 backend/src/backend/gen_context.cpp |  203 ++++++++++++++++++++++++++++++++++-
 1 file changed, 202 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index bf5af41..956b3db 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2148,8 +2148,209 @@ namespace gbe
     p->TYPED_WRITE(header, true, bti);
   }
 
-  void GenContext::emitCalcTimestampInstruction(const SelectionInstruction 
&insn) {
+  /* Emit the instructions for one profiling timestamp point.
+
+     We will record at most 20 timestamps, each one is 32 bits. We will also
+     record the prolog and epilog timestamps in 64 bits. So the format of the
+     curbe timestamp reg is:
+     ---------------------------------------------------------
+     | ts0  | ts1  | ts2  | ts3  | ts4  | ts5  | ts6  | ts7  |  profilingReg0
+     | ts8  | ts9  | ts10 | ts11 | ts12 | ts13 | ts14 | ts15 |  profilingReg1
+     | ts16 | ts17 | ts18 | ts19 |    prolog   |    epilog   |  profilingReg2
+     ---------------------------------------------------------
+     |    tmp0     |    tmp1     |lasttimestamp|  real clock |  profilingReg3
+     ---------------------------------------------------------
+
+     insn.extra.pointNum selects what is emitted:
+       0xFF   prolog: copy the timestamp register into the prolog slot and
+              into lasttimestamp, then return.
+       0xFE   epilog: update the real clock, store realclock + prolog into the
+              epilog slot, then return.
+       other  numbered point: update the real clock, store its low 32 bits
+              into slot ts<pointNum> if that slot is still zero, then refresh
+              lasttimestamp from the timestamp register.
+     insn.extra.timestampType is asserted to be 1.
+     */
+  void GenContext::emitCalcTimestampInstruction(const SelectionInstruction 
&insn)
+  {
+    uint32_t pointNum = insn.extra.pointNum;
+    uint32_t tsType = insn.extra.timestampType;
+
+    GBE_ASSERT(tsType == 1); // only timestamp type 1 is handled by this function
+    GenRegister tmArf = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+        0xc0,
+        0,
+        GEN_TYPE_UW,
+        GEN_VERTICAL_STRIDE_4,
+        GEN_WIDTH_4,
+        GEN_HORIZONTAL_STRIDE_1); // ARF 0xc0 read as 4 x UW; called arf_tm in the comments below
+    GenRegister profilingReg[4];
+    GenRegister tmp;
+    if (p->curr.execWidth == 16) { // SIMD16: src(0) and src(1) each provide a register and the one after it
+      profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+      profilingReg[1] = GenRegister::offset(profilingReg[0], 1);
+      profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(1)), 
GEN_TYPE_UD);
+      profilingReg[3] = GenRegister::offset(profilingReg[2], 1);
+      tmp = GenRegister::retype(ra->genReg(insn.dst(2)), GEN_TYPE_UD);
+    } else { // SIMD8: one source per profiling register
+      GBE_ASSERT(p->curr.execWidth == 8);
+      profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
+      profilingReg[1] = GenRegister::retype(ra->genReg(insn.src(1)), 
GEN_TYPE_UD);
+      profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(2)), 
GEN_TYPE_UD);
+      profilingReg[3] = GenRegister::retype(ra->genReg(insn.src(3)), 
GEN_TYPE_UD);
+      tmp = GenRegister::retype(ra->genReg(insn.dst(4)), GEN_TYPE_UD);
+    }
+    GenRegister tmp0 = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL); // start of profilingReg3
+    GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], 
GEN_TYPE_UL);
+    lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t)); // third 64-bit slot (offset 16)
+    GenRegister realClock = GenRegister::offset(lastTsReg, 0, 
sizeof(uint64_t)); // fourth 64-bit slot (offset 24)
+
+    if (pointNum == 0xFF) { //the prolog, just record the prolog and 
lasttimestamp
+      /* MOV(4)   prolog<1>:UW      arf_tm<4,4,1>:UW  */
+      /* MOV(4)   lastTsReg<1>:UW   arf_tm<4,4,1>:UW  */
+      /* NOTE(review): realclock is not written in this branch; presumably it is
+         expected to start at zero -- confirm. */
+      GenRegister prolog = profilingReg[2];
+      prolog.type = GEN_TYPE_UW;
+      prolog.hstride = GEN_HORIZONTAL_STRIDE_1;
+      prolog.vstride = GEN_VERTICAL_STRIDE_4;
+      prolog.width = GEN_WIDTH_4;
+      prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t)); // skip ts16..ts19
+      GenRegister _lastTsReg = lastTsReg;
+      _lastTsReg.type = GEN_TYPE_UW;
+      _lastTsReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+      _lastTsReg.vstride = GEN_VERTICAL_STRIDE_4;
+      _lastTsReg.width = GEN_WIDTH_4;
+
+      p->push(); {
+        p->curr.execWidth = 4;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        p->MOV(prolog, tmArf);
+        p->MOV(_lastTsReg, tmArf);
+      } p->pop();
+
+      return;
+    }
+
+    /* Read the current timestamp into tmp0.
+       MOV(4)   tmp0<1>:UW      arf_tm<4,4,1>:UW  */
+    p->push(); {
+      p->curr.execWidth = 4;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      GenRegister _tmp0 = tmp0;
+      _tmp0.type = GEN_TYPE_UW;
+      _tmp0.hstride = GEN_HORIZONTAL_STRIDE_1;
+      _tmp0.vstride = GEN_VERTICAL_STRIDE_4;
+      _tmp0.width = GEN_WIDTH_4;
+      p->MOV(_tmp0, tmArf);
+    } p->pop();
+
+    /* Calc the time elapsed.
+       SUB(1)  tmp0<1>:UL  tmp0<1>:UL   lastTS<0,1,0>
+       ADD(1)  tmp0<1>:UL  tmp0<1>:UL   0xFFFFFFFFFFFFFFFF     //Mod OP
+       Both 64-bit operations are emitted as 32-bit halves: SUBB/ADDC on the low
+       dword, a MOV of the accumulator into tmp to fetch the borrow/carry, then
+       ADDs on the high dword. */
+    p->push(); {
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->SUBB(GenRegister::retype(tmp0, GEN_TYPE_UD),
+          GenRegister::retype(tmp0, GEN_TYPE_UD), 
GenRegister::retype(lastTsReg, GEN_TYPE_UD));
+      /* FIXME: the acc register's value cannot be read correctly with SIMD1,
+         so the MOV from acc below runs with execWidth 8. */
+      p->curr.execWidth = 8;
+      p->MOV(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD));
+      p->curr.execWidth = 1;
+      p->ADD(GenRegister::retype(GenRegister::offset(tmp0, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+          GenRegister::retype(GenRegister::offset(tmp0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+          GenRegister::negate(GenRegister::toUniform(tmp, GEN_TYPE_UD)));
+      p->ADD(GenRegister::retype(GenRegister::offset(tmp0, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+          GenRegister::retype(GenRegister::offset(tmp0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+          
GenRegister::negate(GenRegister::retype(GenRegister::offset(lastTsReg, 0, 
sizeof(uint32_t)), GEN_TYPE_UD)));
+      // Mod 0xFFFFFFFF FFFFFFFF: add 0xFFFFFFFF to the low dword, then the carry and 0xFFFFFFFF to the high dword
+      p->ADDC(GenRegister::retype(tmp0, GEN_TYPE_UD),
+          GenRegister::retype(tmp0, GEN_TYPE_UD), 
GenRegister::immud(0xffffffff));
+      p->curr.execWidth = 8;
+      p->MOV(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD));
+      p->curr.execWidth = 1;
+      p->ADD(GenRegister::retype(GenRegister::offset(tmp0, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+          GenRegister::retype(GenRegister::offset(tmp0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+          GenRegister::toUniform(tmp, GEN_TYPE_UD));
+      p->ADD(GenRegister::retype(GenRegister::offset(tmp0, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+          GenRegister::retype(GenRegister::offset(tmp0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD),
+          GenRegister::immud(0xffffffff));
+    } p->pop();
+
+    /* Update the real clock
+       ADD(1)   realclock<1>:UL  realclock<1>:UL  tmp0<1>:UL
+       The carry is read from dword 6 of tmp, which is the dword position of
+       realclock (byte offset 24) inside profilingReg3; the elapsed-time code
+       above reads dword 0, the position of tmp0.
+       NOTE(review): presumably the accumulator channel follows the
+       destination sub-register -- confirm. */
+    p->push(); {
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->ADDC(GenRegister::retype(realClock, GEN_TYPE_UD),
+          GenRegister::retype(realClock, GEN_TYPE_UD), 
GenRegister::retype(tmp0, GEN_TYPE_UD));
+      p->curr.execWidth = 8;
+      p->MOV(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD));
+      p->curr.execWidth = 1;
+      p->ADD(GenRegister::retype(GenRegister::offset(realClock, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+          GenRegister::retype(GenRegister::offset(realClock, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+          GenRegister::offset(GenRegister::toUniform(tmp, GEN_TYPE_UD), 0, 
6*sizeof(uint32_t)));
+      p->ADD(GenRegister::retype(GenRegister::offset(realClock, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+          GenRegister::retype(GenRegister::offset(realClock, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+          GenRegister::retype(GenRegister::offset(tmp0, 0, sizeof(uint32_t)), 
GEN_TYPE_UD));
+    } p->pop();
+
+    if (pointNum == 0xFE) { //the epilog, store realclock + prolog in the epilog slot and return.
+      /* MOV(1)   epilog<1>:UL   realclock<0,1,0>:UL  */
+      /* ADD(1)   epilog<1>:UL   prolog<0,1,0>:UL     i.e. epilog = realclock + prolog,
+         again emitted as 32-bit halves; epilog also sits at dword 6 of its register. */
+      GenRegister prolog = GenRegister::toUniform(profilingReg[2], 
GEN_TYPE_UD);
+      prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t));
+      GenRegister epilog = GenRegister::offset(prolog, 0, 2*sizeof(uint32_t));
+      p->push(); {
+        p->curr.execWidth = 1;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        p->MOV(epilog, GenRegister::retype(realClock, GEN_TYPE_UD));
+        p->MOV(GenRegister::offset(epilog, 0, sizeof(uint32_t)),
+            GenRegister::offset(GenRegister::retype(realClock, GEN_TYPE_UD), 
0, sizeof(uint32_t)));
+
+        p->ADDC(GenRegister::retype(epilog, GEN_TYPE_UD),
+            GenRegister::retype(epilog, GEN_TYPE_UD), 
GenRegister::retype(prolog, GEN_TYPE_UD));
+        p->curr.execWidth = 8;
+        p->MOV(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD));
+        p->curr.execWidth = 1;
+        p->ADD(GenRegister::retype(GenRegister::offset(epilog, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+            GenRegister::retype(GenRegister::offset(epilog, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+            GenRegister::offset(GenRegister::toUniform(tmp, GEN_TYPE_UD), 0, 
6*sizeof(uint32_t)));
+        p->ADD(GenRegister::retype(GenRegister::offset(epilog, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+            GenRegister::retype(GenRegister::offset(epilog, 0, 
sizeof(uint32_t)), GEN_TYPE_UD),
+            GenRegister::retype(GenRegister::offset(prolog, 0, 
sizeof(uint32_t)), GEN_TYPE_UD));
+      } p->pop();
+
+      return;
+    }
+
+    /* We only record the timestamp of the first time this point is reached. If
+       this point is in a loop, it can be reached many times; the later
+       timestamps are not recorded. A 32-bit timestamp can represent about
+       3.2s, and each kernel's execution time should never exceed 3s, so we
+       just record the low 32 bits.
+       CMP.EQ(1) flag        NULL           tsReg_n<1>:UD  0x0
+       (+flag) MOV(1)        tsReg_n<1>:UD  realclock<1>:UD   Just record the low 32 bits
+       The flag register is taken from insn.state.flag / insn.state.subFlag.
+       The MOV only executes while the slot still compares equal to zero.
+       */
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, 
insn.state.subFlag);
+    GenRegister tsReg = GenRegister::toUniform(profilingReg[pointNum/8], 
GEN_TYPE_UD);
+    tsReg = GenRegister::offset(tsReg, 0, (pointNum%8)*sizeof(uint32_t)); // 8 slots of 32 bits per register
+
+    p->push(); {
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_EQ, tsReg, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.inversePredicate = 0;
+      p->MOV(tsReg, GenRegister::retype(GenRegister::retype(realClock, 
GEN_TYPE_UD), GEN_TYPE_UD));
+    } p->pop();
+
+    /* Store the timestamp for next point use.
+       MOV(4)   lastTS<1>:UW     arf_tm<4,4,1>:UW  */
+    p->push(); {
+      p->curr.execWidth = 4;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      GenRegister _lastTsReg = lastTsReg;
+      _lastTsReg.type = GEN_TYPE_UW;
+      _lastTsReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+      _lastTsReg.vstride = GEN_VERTICAL_STRIDE_4;
+      _lastTsReg.width = GEN_WIDTH_4;
+      p->MOV(_lastTsReg, tmArf);
+    } p->pop();
   }
+
   void GenContext::emitStoreProfilingInstruction(const SelectionInstruction 
&insn) {
     // Empty body: no instructions are emitted for this selection instruction here.
   }
-- 
1.7.9.5

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet

Reply via email to