RESULTS (discussion/explanations are in code comments): A. thread comm via master thread => 1.608 Msum/s, 408 ms; B. thread comm via master thread + add4 => 1.730 Msum/s, 378 ms; C. (this patch) SLM/barrier + add4, 8 reads/thread => 2.495 Msum/s, ~262 ms; D. (this patch) SLM/barrier + add4, 4 reads/thread => 3.813 Msum/s (time not measured).
Signed-off-by: Grigore Lupescu <grigore.lupe...@intel.com> --- backend/src/backend/gen_context.cpp | 39 +++++++++---- backend/src/backend/gen_insn_selection.cpp | 90 ++++++++++++++++++++++++++++-- 2 files changed, 112 insertions(+), 17 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 0ea0dd0..3fcc8ce 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -2943,21 +2943,32 @@ namespace gbe } } } - } else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) { + } + else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD){ + tmp.hstride = GEN_HORIZONTAL_STRIDE_1; + tmp.vstride = GEN_VERTICAL_STRIDE_4; + tmp.width = GEN_WIDTH_4; + GBE_ASSERT(tmp.type == theVal.type); - GenRegister v = GenRegister::toUniform(tmp, theVal.type); - for (uint32_t i = 0; i < simd; i++) { - p->ADD(threadData, threadData, v); - v.subnr += typeSize(theVal.type); - if (v.subnr == 32) { - v.subnr = 0; - v.nr++; - } + GenRegister partialSum = tmp; + + /* adjust offset, compute add with ADD4/ADD */ + for (uint32_t i = 1; i < simd/4; i++){ + tmp = tmp.suboffset(tmp, 4); + p->push(); + p->curr.execWidth = GEN_WIDTH_16; + p->ADD(partialSum, partialSum, tmp); + p->pop(); } - } + for (uint32_t i = 0; i < 4; i++){ + partialSum.width = GEN_WIDTH_1; + p->ADD(threadData, threadData, partialSum); + partialSum = GenRegister::suboffset(partialSum, 1); + } + } p->pop(); - } +} #define SEND_RESULT_MSG() \ do { \ @@ -3028,6 +3039,8 @@ do { \ workgroupOpInThread(msgData, theVal, threadData, tmp, simd, wg_op, p); } p->pop(); +/* deactivate code for other OPs - EXPERIMENTAL ADD_REDUCE only */ +#if 0 /* If we are the only one thread, no need to send msg, just broadcast the result.*/ p->push(); { p->curr.predicate = GEN_PREDICATE_NONE; @@ -3123,7 +3136,6 @@ do { \ p->curr.predicate = GEN_PREDICATE_NONE; p->WAIT(2); p->patchJMPI(jip, (p->n_instruction() - jip), 0); - /* Do something when get the msg. 
*/ p->curr.execWidth = simd; p->MOV(dst, msgData); @@ -3143,6 +3155,9 @@ do { \ p->patchJMPI(jip, (p->n_instruction() - jip), 0); } p->pop(); } +#endif + + p->MOV(dst, threadData); if (oneThreadJip >=0) p->patchJMPI(oneThreadJip, (p->n_instruction() - oneThreadJip), 0); diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 001a3c5..3fa03a3 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -6042,9 +6042,9 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp sel.curr.noMask = 1; sel.curr.execWidth = 8; - sel.MOV(tmp, sr0_0); + sel.MOV(tmp, sel.selReg(ocl::threadid, ir::TYPE_U32)); - sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(2)); + sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(4)); sel.ADD(addr, addr, GenRegister::immud(slmAddr)); sel.push(); { @@ -6086,7 +6086,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp sel.curr.execWidth = 8; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; - sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(1)); + sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(0)); /* Wrap the next thread id. */ sel.push(); { @@ -6192,7 +6192,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp if (workGroupOp == WORKGROUP_OP_BROADCAST) { return emitWGBroadcast(sel, insn); - } else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) { + } else if (workGroupOp >= WORKGROUP_OP_REDUCE_MIN && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) { const uint32_t slmAddr = insn.getSlmAddr(); /* First, we create the TheadID/localID map, in order to get which thread hold the next 16 workitems. 
*/ @@ -6223,7 +6223,87 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp sel.curr.subFlag = 1; sel.WORKGROUP_OP(workGroupOp, dst, src, nextThreadID, threadID, threadNum, tmp); } sel.pop(); - } else { + } + else if (workGroupOp == WORKGROUP_OP_REDUCE_ADD) { /* EXPERIMENTAL */ + + const Type type = insn.getType(); + GenRegister dst = sel.selReg(insn.getDst(0), type); + const uint32_t srcNum = insn.getSrcNum(); + + GBE_ASSERT(srcNum == 3); + GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn); + GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid); + + GenRegister threadID = sel.selReg(ocl::threadid, ir::TYPE_U32); + GenRegister threadNum = sel.selReg(ocl::threadn, ir::TYPE_U32); + GenRegister tmp = GenRegister::retype( + sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); + GenRegister src = sel.selReg(insn.getSrc(2), type); + + GenRegister nextThreadID = sel.selReg(sel.reg(FAMILY_WORD), type); + GenRegister result = sel.selReg(sel.reg(FAMILY_WORD), type); + GenRegister addr = GenRegister::retype( + sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32), GEN_TYPE_UD); + GenRegister data_in = sel.selReg(sel.reg(FAMILY_WORD), type); + + vector<GenRegister> fakeTemps; + fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_WORD), type)); + fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_WORD), type)); + + const uint32_t slmAddr = insn.getSlmAddr(); + + /* compute individual slice of workitems, (e.g. 0->16 workitems) */ + sel.WORKGROUP_OP(workGroupOp, result, src, + nextThreadID, threadID, threadNum, tmp); + + /* write result data to SLM with offset using threadID*/ + sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32), + GenRegister::immud(4)); + sel.ADD(addr, addr, GenRegister::immud(slmAddr)); + + /****** TODO (1) OPTIMIZE STORAGE - maybe store data more efficient + * or more compact ? 
+ */ + sel.UNTYPED_WRITE(addr, + &result, 1, GenRegister::immw(0xfe), fakeTemps); + + /* barrier, all threads have finished computing and writing results */ + /****** TODO (2) OPTIMIZE BARRIER - not sure if all flags are required + * Maybe other methods to ensure data has been written to SLM ? */ + sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); + + /* read data from SLM and compute */ + sel.MOV(addr, GenRegister::immud(slmAddr)); + sel.MOV(dst, GenRegister::immud(0)); + + /****** TODO (3) OPTIMIZE SLM - code is inneficient at SLM read... + * Each thread regardless of other threads reads from SLM + * exactly 8 results - high performance penalty. Solutions: + * 1. Use threadnum and IF conditions. Performance penalty from IF ? + * 2. Read multiple elements with UNTYPED_READ ? Sync read threads ? + * 3. Use TYPED_READ ? Other methods to read SLM ? + * + * RESULTS + * A. thread comm master => 1.608 Msum/S, 408ms + * B. thread comm master + add4 => 1.730 Msum/S, 378ms + * C. (this) slm/barrier + add4, 8 READ/thread => 2.495 Msum/S, ~262ms + * D. (this) slm/barrier + add4, 4 READ/thread => 3.813 Msum/S + * + * DEFAULT implementation (with 8 results read from SLM per thread) + * NOTE: After the SLM read, each thread has its own set of results + * computed by other threads - each thread can work on the final + * result (in parallel) without the need for other communication + */ + for(int i=0; i<8; i++){ + sel.UNTYPED_READ(addr, + &data_in, 1, GenRegister::immw(0xFE), fakeTemps); + sel.ADD(addr, addr, GenRegister::immud(4)); + sel.ADD(dst, dst, data_in); + } + } + + else { GBE_ASSERT(0); } -- 2.1.4 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet