> -----Original Message----- > From: Yang, Rong R > Sent: Wednesday, April 2, 2014 4:41 PM > To: Gong, Zhigang; beignet@lists.freedesktop.org > Cc: Gong, Zhigang > Subject: RE: [Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF > to handle each blocks. > > > > -----Original Message----- > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > Zhigang Gong > Sent: Friday, March 28, 2014 3:11 PM > To: beignet@lists.freedesktop.org > Cc: Gong, Zhigang > Subject: [Beignet] [PATCH 10/18] GBE: Disable SPF and use JMPI + IF/ENDIF to > handle each blocks. > > When SPF (single program flow) is enabled, we always need to use f0 > as the predication of almost every instruction. This brings some > trouble when we want a two-level mask mechanism, for > example with the SEL instruction and some BOOL operations. We > have to use more than one instruction to do that, which simply > introduces 100% overhead for those instructions. > > Signed-off-by: Zhigang Gong <zhigang.g...@intel.com> > --- > backend/src/backend/gen/gen_mesa_disasm.c | 31 ++-- > backend/src/backend/gen_context.cpp | 141 ++++++++--------- > backend/src/backend/gen_defs.hpp | 5 + > backend/src/backend/gen_encoder.cpp | 29 +++- > backend/src/backend/gen_insn_selection.cpp | 237 > +++++++++++++---------------- > backend/src/backend/gen_insn_selection.hpp | 5 + > backend/src/backend/gen_insn_selection.hxx | 2 +- > backend/src/backend/gen_register.hpp | 13 +- > src/cl_api.c | 1 + > src/intel/intel_driver.c | 1 + > src/intel/intel_gpgpu.c | 2 +- > utests/compiler_long_cmp.cpp | 1 + > utests/compiler_unstructured_branch0.cpp | 4 +- > utests/compiler_unstructured_branch1.cpp | 3 +- > utests/compiler_unstructured_branch2.cpp | 10 +- > 15 files changed, 249 insertions(+), 236 deletions(-) > > diff --git a/backend/src/backend/gen/gen_mesa_disasm.c > b/backend/src/backend/gen/gen_mesa_disasm.c > index 84ef0c8..e58ef31 100644 > --- a/backend/src/backend/gen/gen_mesa_disasm.c > +++ b/backend/src/backend/gen/gen_mesa_disasm.c > @@ -100,13 +100,13 @@ static const struct { > [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 }, > [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, > [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 }, > - [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 1, .ndst = 0 }, > - [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, > - [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 1, .ndst = 0 }, > - [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, > - [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, > - [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, > - [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, > + [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 }, > + [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 }, > + [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 }, > + [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 }, > + [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 }, > + [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 }, > + [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 }, > [GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, > [GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, > [GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, > @@ -1126,17 +1126,18 @@ int gen_disasm (FILE *file, const void > *opaque_insn) > } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF || > inst->header.opcode
== GEN_OPCODE_ELSE || > inst->header.opcode == GEN_OPCODE_ENDIF || > - inst->header.opcode == GEN_OPCODE_WHILE)) { > - // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count); > - assert(0); > + inst->header.opcode == GEN_OPCODE_WHILE || > + inst->header.opcode == GEN_OPCODE_BRD || > + inst->header.opcode == GEN_OPCODE_JMPI)) { > + format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip); > } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK || > inst->header.opcode == GEN_OPCODE_CONTINUE || > - inst->header.opcode == GEN_OPCODE_HALT)) { > - // XXX format (file, " %d %d", inst->bits3.break_cont.uip, > inst->bits3.break_cont.jip); > - assert(0); > - } else if (inst->header.opcode == GEN_OPCODE_JMPI) { > + inst->header.opcode == GEN_OPCODE_HALT || > + inst->header.opcode == GEN_OPCODE_BRC)) { > + format (file, " %d %d", inst->bits3.gen7_branch.jip, > inst->bits3.gen7_branch.uip); > + }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) { > format (file, " %d", inst->bits3.d); > - } > + }*/ > > if (opcode[inst->header.opcode].nsrc > 0) { > pad (file, 32); > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index c46127a..bab059b 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -87,33 +87,29 @@ namespace gbe > const LabelIndex label = pair.first; > const int32_t insnID = pair.second; > const int32_t targetID = labelPos.find(label)->second; > - p->patchJMPI(insnID, (targetID-insnID-1) * 2); > + p->patchJMPI(insnID, (targetID - insnID) * 2); > + } > + for (auto pair : branchPos3) { > + const LabelPair labelPair = pair.first; > + const int32_t insnID = pair.second; > + const int32_t jip = labelPos.find(labelPair.l0)->second + > labelPair.offset0; > + const int32_t uip = labelPos.find(labelPair.l1)->second + > labelPair.offset1; > + assert((jip - insnID) * 2 < 32767 && (jip - insnID) > -32768); > + assert((uip - insnID) * 2 < 32767 && (uip - insnID) > -32768); > >>>>>>>>> should be (uip - insnID) * 2 > -32768
Good catch. Will fix it later. Thanks.
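For reference, a minimal sketch of what the corrected range checks could look like (the fitsInS16 helper is purely illustrative and not part of this patch; it just reuses the same bounds the existing asserts use):

  // Sketch only: gen7_branch.jip/uip are signed 16-bit fields, so the scaled
  // offset ((target - insnID) * 2) is what must be range checked for both.
  static inline bool fitsInS16(int32_t v) { return v > -32768 && v < 32767; }
  // inside the branchPos3 loop:
  assert(fitsInS16((jip - insnID) * 2));
  assert(fitsInS16((uip - insnID) * 2));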
> > > > + p->patchJMPI(insnID, (((uip - insnID) * 2) << 16) | ((jip - insnID) * > 2)); > } > } > > void GenContext::clearFlagRegister(void) { > // when group size not aligned to simdWidth, flag register need clear to > // make prediction(any8/16h) work correctly > - const GenRegister emaskReg = > ra->genReg(GenRegister::uw1grf(ir::ocl::emask)); > - const GenRegister notEmaskReg = > ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)); > - uint32_t execWidth = p->curr.execWidth; > + const GenRegister blockip = > ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); > p->push(); > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - /* clear all the bit in f0.0. */ > - p->curr.execWidth = 1; > - p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), > GenRegister::immuw(0x0000)); > - /* clear the barrier mask bits to all zero0*/ > - p->curr.noMask = 0; > - p->curr.useFlag(0, 0); > - p->curr.execWidth = execWidth; > - /* set all the active lane to 1. Inactive lane remains 0. */ > - p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), > GenRegister::ud16grf(126, 0)); > - p->curr.noMask = 1; > - p->curr.execWidth = 1; > - p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), > GEN_TYPE_UW)); > - p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF)); > - p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), > notEmaskReg); > + p->curr.noMask = 1; > + p->curr.predicate = GEN_PREDICATE_NONE; > + p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL)); > + p->curr.noMask = 0; > + p->MOV(blockip, GenRegister::immuw(0)); > p->pop(); > } > > @@ -148,7 +144,6 @@ namespace gbe > // Check that everything is consistent in the kernel code > const uint32_t perLaneSize = kernel->getStackSize(); > const uint32_t perThreadSize = perLaneSize * this->simdWidth; > - //const int32_t offset = GEN_REG_SIZE + > kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, > GBE_STACK_BUFFER); > GBE_ASSERT(perLaneSize > 0); > GBE_ASSERT(isPowerOf<2>(perLaneSize) == true); > GBE_ASSERT(isPowerOf<2>(perThreadSize) == true); > @@ -325,6 +320,7 @@ namespace gbe > for (int i = 0; i < w / 8; i ++) { > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), > src0, src1); > p->curr.accWrEnable = 1; > p->MACH(tmp, src0, src1); > @@ -500,6 +496,7 @@ namespace gbe > int execWidth = p->curr.execWidth; > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.execWidth = 8; > for (int nib = 0; nib < execWidth / 4; nib ++) { > p->AND(dest, src.bottom_half(), GenRegister::immud(63)); > @@ -539,6 +536,7 @@ namespace gbe > void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister > low, GenRegister tmp, GenRegister flagReg) { > p->SHR(sign, high, GenRegister::immud(31)); > p->push(); > + p->curr.noMask = 1; > p->curr.predicate = GEN_PREDICATE_NONE; > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0)); > @@ -574,6 +572,7 @@ namespace gbe > I64FullMult(e, f, g, h, a, b, c, d); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0)); > p->curr.predicate = GEN_PREDICATE_NORMAL; > @@ -626,6 +625,7 @@ namespace gbe > p->OR(a, e, f); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > 
p->CMP(GEN_CONDITIONAL_NZ, a, zero); > p->curr.predicate = GEN_PREDICATE_NORMAL; > @@ -639,6 +639,7 @@ namespace gbe > I64FullMult(e, f, g, h, a, b, c, d); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > p->CMP(GEN_CONDITIONAL_NZ, i, zero); > p->curr.predicate = GEN_PREDICATE_NORMAL; > @@ -670,6 +671,7 @@ namespace gbe > p->push(); > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->CMP(GEN_CONDITIONAL_NZ, e, zero); > p->curr.predicate = GEN_PREDICATE_NORMAL; > p->MOV(b, one); > @@ -793,6 +795,7 @@ namespace gbe > case SEL_OP_I64SHL: > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > collectShifter(a, y); > loadBottomHalf(e, x); > loadTopHalf(f, x); > @@ -820,6 +823,7 @@ namespace gbe > case SEL_OP_I64SHR: > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > collectShifter(a, y); > loadBottomHalf(e, x); > loadTopHalf(f, x); > @@ -848,6 +852,7 @@ namespace gbe > f.type = GEN_TYPE_D; > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > collectShifter(a, y); > loadBottomHalf(e, x); > loadTopHalf(f, x); > @@ -894,6 +899,7 @@ namespace gbe > p->push(); > p->curr.useFlag(flag.flag_nr(), flag.flag_subnr()); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32)); > //high == 0 > p->curr.predicate = GEN_PREDICATE_NORMAL; > p->MOV(dst, low); > @@ -911,6 +917,7 @@ namespace gbe > p->pop(); > > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23)); > p->curr.predicate = GEN_PREDICATE_NORMAL; > p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32)); > //exp>23 && high!=0 > @@ -936,6 +943,7 @@ namespace gbe > p->pop(); > > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23)); > p->curr.predicate = GEN_PREDICATE_NORMAL; > p->MOV(dst_ud, GenRegister::immud(0)); //exp==9, SHR == 0 > @@ -956,7 +964,7 @@ namespace gbe > p->SHL(high, low, tmp); > p->MOV(low, GenRegister::immud(0)); > > - p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2); > + p->patchJMPI(jip1, (p->n_instruction() - jip1) * 2); > p->curr.predicate = GEN_PREDICATE_NONE; > p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31)); > //update dst where high != 0 > p->curr.predicate = GEN_PREDICATE_NORMAL; > @@ -970,7 +978,7 @@ namespace gbe > p->CMP(GEN_CONDITIONAL_EQ, high, > GenRegister::immud(0x80000000)); > p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0)); > p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe)); > - p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2); > + p->patchJMPI(jip0, (p->n_instruction() - jip0) * 2); > > p->pop(); > > @@ -994,6 +1002,7 @@ namespace gbe > p->MOV(tmp_high, high); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(f0.flag_nr(), f0.flag_subnr()); > p->CMP(GEN_CONDITIONAL_GE, tmp_high, > GenRegister::immud(0x80000000)); > p->curr.predicate = GEN_PREDICATE_NORMAL; > @@ -1006,6 +1015,7 @@ namespace gbe > UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(f0.flag_nr(), f0.flag_subnr()); > p->CMP(GEN_CONDITIONAL_GE, tmp_high, > GenRegister::immud(0x80000000)); > p->curr.predicate = 
GEN_PREDICATE_NORMAL; > @@ -1039,6 +1049,7 @@ namespace gbe > if(dst.is_signed_int()) { > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr()); > p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0)); > p->curr.predicate = GEN_PREDICATE_NORMAL; > @@ -1066,11 +1077,10 @@ namespace gbe > f1.width = GEN_WIDTH_1; > GenRegister f2 = GenRegister::suboffset(f1, 1); > GenRegister f3 = GenRegister::suboffset(f1, 2); > - GenRegister f4 = GenRegister::suboffset(f1, 3); > > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > - saveFlag(f4, flag, subFlag); > + p->curr.noMask = 1; > loadTopHalf(tmp0, src0); > loadTopHalf(tmp1, src1); > switch(insn.extra.function) { > @@ -1130,12 +1140,13 @@ namespace gbe > NOT_IMPLEMENTED; > } > p->curr.execWidth = 1; > - p->AND(f1, f1, f4); > p->MOV(GenRegister::flag(flag, subFlag), f1); > p->pop(); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->MOV(dst, GenRegister::immd(0)); > + p->curr.noMask = 0; > p->curr.predicate = GEN_PREDICATE_NORMAL; > p->MOV(dst, GenRegister::immd(-1)); > p->pop(); > @@ -1163,6 +1174,7 @@ namespace gbe > p->ADD(c, c, d); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > if(! dst.is_signed_int()) { > p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); > @@ -1176,6 +1188,7 @@ namespace gbe > p->MOV(a, GenRegister::immud(0x80000000u)); > p->MOV(b, GenRegister::immud(0)); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0)); > p->curr.predicate = GEN_PREDICATE_NORMAL; > p->CMP(GEN_CONDITIONAL_GE, a, > GenRegister::immud(0x80000000u)); > @@ -1209,6 +1222,7 @@ namespace gbe > p->ADD(c, c, d); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > if(! 
dst.is_signed_int()) { > p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); > @@ -1238,6 +1252,7 @@ namespace gbe > src = src.top_half(); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.execWidth = 8; > p->MOV(dest, src); > p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, > 4)); > @@ -1252,6 +1267,7 @@ namespace gbe > int execWidth = p->curr.execWidth; > dest = dest.top_half(); > p->push(); > + p->curr.predicate = GEN_PREDICATE_NORMAL; > p->curr.execWidth = 8; > p->MOV(dest, src); > p->curr.nibControl = 1; > @@ -1271,6 +1287,7 @@ namespace gbe > src = src.bottom_half(); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.execWidth = 8; > p->MOV(dest, src); > p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, > 4)); > @@ -1286,6 +1303,7 @@ namespace gbe > dest = dest.bottom_half(); > p->push(); > p->curr.execWidth = 8; > + p->curr.predicate = GEN_PREDICATE_NORMAL; > p->MOV(dest, src); > p->curr.nibControl = 1; > p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, > 4)); > @@ -1369,6 +1387,7 @@ namespace gbe > loadBottomHalf(d, y); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, > c); > I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, > d); > p->ADD(e, e, f); > @@ -1443,6 +1462,7 @@ namespace gbe > // condition <- (c,d)==0 && (a,b)>=(e,f) > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->MOV(l, zero); > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > p->CMP(GEN_CONDITIONAL_EQ, a, e); > @@ -1477,6 +1497,7 @@ namespace gbe > p->ADD(m, m, one); > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64)); > > @@ -1484,7 +1505,6 @@ namespace gbe > p->curr.noMask = 1; > p->AND(flagReg, flagReg, emaskReg); > > - p->curr.predicate = GEN_PREDICATE_NORMAL; > // under condition, jump back to start point > if (simdWidth == 8) > p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; > @@ -1493,8 +1513,9 @@ namespace gbe > else > NOT_IMPLEMENTED; > int jip = -(int)(p->n_instruction() - loop_start + 1) * 2; > + p->curr.noMask = 1; > p->JMPI(zero); > - p->patchJMPI(p->n_instruction()-2, jip); > + p->patchJMPI(p->n_instruction() - 2, jip + 2); > p->pop(); > // end of loop > } > @@ -1502,6 +1523,7 @@ namespace gbe > if(x.is_signed_int()) { > p->push(); > p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr()); > p->CMP(GEN_CONDITIONAL_NEQ, k, zero); > p->curr.predicate = GEN_PREDICATE_NORMAL; > @@ -1534,7 +1556,7 @@ namespace gbe > } > > void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) { > - NOT_IMPLEMENTED; > + p->NOP(); > } > > void GenContext::emitWaitInstruction(const SelectionInstruction &insn) { > @@ -1546,59 +1568,24 @@ namespace gbe > const GenRegister fenceDst = ra->genReg(insn.dst(0)); > uint32_t barrierType = insn.extra.barrierType; > const GenRegister barrierId = > ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid)); > - GenRegister blockIP; > - uint32_t exeWidth = p->curr.execWidth; > - ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex(); > - > - if (exeWidth == 16) > - blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip)); > - else if (exeWidth == 8) > - 
blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); > > - p->push(); > - /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the > instructions > - after the barrier, If there is any lane still remains zero. */ > - p->MOV(blockIP, GenRegister::immuw(0xFFFF)); > - p->curr.noMask = 1; > - p->curr.execWidth = 1; > - this->branchPos2.push_back(std::make_pair(label, p->n_instruction())); > - if (exeWidth == 16) > - p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; > - else if (exeWidth == 8) > - p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; > - else > - NOT_IMPLEMENTED; > - p->curr.inversePredicate = 1; > - // If not all channel is set to 1, the barrier is still waiting for > other lanes to > complete, > - // jump to next basic block. > - p->JMPI(GenRegister::immud(0)); > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->MOV(GenRegister::flag(0, 0), > ra->genReg(GenRegister::uw1grf(ir::ocl::emask))); > - p->pop(); > - > - p->push(); > - p->curr.useFlag(0, 0); > - /* Restore the blockIP to current label. */ > - p->MOV(blockIP, > GenRegister::immuw(insn.parent->bb->getLabelIndex())); > if (barrierType == ir::syncGlobalBarrier) { > p->FENCE(fenceDst); > p->MOV(fenceDst, fenceDst); > } > - p->curr.predicate = GEN_PREDICATE_NONE; > - // As only the payload.2 is used and all the other regions are ignored > - // SIMD8 mode here is safe. > - p->curr.execWidth = 8; > - p->curr.physicalFlag = 0; > - p->curr.noMask = 1; > - // Copy barrier id from r0. > - p->AND(src, barrierId, GenRegister::immud(0x0f000000)); > - // A barrier is OK to start the thread synchronization *and* SLM fence > - p->BARRIER(src); > - // Now we wait for the other threads > - p->curr.execWidth = 1; > - p->WAIT(); > - // we executed the barrier then restore the barrier soft mask to initial > value. > - p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), > ra->genReg(GenRegister::uw1grf(ir::ocl::notemask))); > + p->push(); > + // As only the payload.2 is used and all the other regions are ignored > + // SIMD8 mode here is safe. > + p->curr.execWidth = 8; > + p->curr.physicalFlag = 0; > + p->curr.noMask = 1; > + // Copy barrier id from r0. 
> + p->AND(src, barrierId, GenRegister::immud(0x0f000000)); > + // A barrier is OK to start the thread synchronization *and* SLM fence > + p->BARRIER(src); > + p->curr.execWidth = 1; > + // Now we wait for the other threads > + p->WAIT(); > p->pop(); > } > > diff --git a/backend/src/backend/gen_defs.hpp > b/backend/src/backend/gen_defs.hpp > index 7c49497..e731174 100644 > --- a/backend/src/backend/gen_defs.hpp > +++ b/backend/src/backend/gen_defs.hpp > @@ -896,6 +896,11 @@ struct GenInstruction > uint32_t end_of_thread:1; > } gen7_msg_gw; > > + struct { > + uint32_t jip:16; > + uint32_t uip:16; > + } gen7_branch; > + > int d; > uint32_t ud; > float f; > diff --git a/backend/src/backend/gen_encoder.cpp > b/backend/src/backend/gen_encoder.cpp > index fc7e53d..06aa769 100644 > --- a/backend/src/backend/gen_encoder.cpp > +++ b/backend/src/backend/gen_encoder.cpp > @@ -837,6 +837,7 @@ namespace gbe > GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD); > push(); > curr.predicate = GEN_PREDICATE_NONE; > + curr.noMask = 1; > curr.execWidth = 1; > MOV(r, GenRegister::immud(u.u[1])); > MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0])); > @@ -907,6 +908,7 @@ namespace gbe > push(); > curr.execWidth = 8; > curr.predicate = GEN_PREDICATE_NONE; > + curr.noMask = 1; > MOV(r0, src0); > MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4)); > curr.predicate = GEN_PREDICATE_NORMAL; > @@ -920,6 +922,7 @@ namespace gbe > push(); > curr.execWidth = 8; > curr.predicate = GEN_PREDICATE_NONE; > + curr.noMask = 1; > MOV(r0, GenRegister::suboffset(src0, 8)); > MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, > 12)); > curr.predicate = GEN_PREDICATE_NORMAL; > @@ -1058,7 +1061,7 @@ namespace gbe > > #define ALU2_BRA(OP) \ > void GenEncoder::OP(GenRegister src) { \ > - alu2(this, GEN_OPCODE_##OP, GenRegister::null(), GenRegister::null(), > src); \ > + alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), > GenRegister::nullud(), src); \ > } > > ALU2_BRA(IF) > @@ -1071,9 +1074,21 @@ namespace gbe > GBE_ASSERT(insnID < this->store.size()); > GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI || > insn.header.opcode == GEN_OPCODE_BRD || > - insn.header.opcode == GEN_OPCODE_ENDIF); > - if ( jumpDistance > -32769 && jumpDistance < 32768 ) { > - this->setSrc1(&insn, GenRegister::immd(jumpDistance)); > + insn.header.opcode == GEN_OPCODE_ENDIF || > + insn.header.opcode == GEN_OPCODE_IF || > + insn.header.opcode == GEN_OPCODE_BRC); > + > + if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 > && jumpDistance < 32768)) { > + int offset = 0; > + if (insn.header.opcode == GEN_OPCODE_IF) { > + this->setSrc1(&insn, GenRegister::immd(jumpDistance)); > + return; > + } > + else if (insn.header.opcode == GEN_OPCODE_JMPI) { > + offset = -2; > + /*assert(jumpDistance > -32769 && jumpDistance < > 32768);*/ > + } > + this->setSrc1(&insn, GenRegister::immd(jumpDistance + offset)); > } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) { > // For the conditional jump distance out of S15 range, we need to use > an > // inverted jmp followed by a add ip, ip, distance to implement. > @@ -1085,10 +1100,12 @@ namespace gbe > // for all the branching instruction. And need to adjust the distance > // for those branch instruction's start point and end point contains > // this instruction. 
> + GenInstruction &insn2 = this->store[insnID+1]; > + GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP); > insn.header.opcode = GEN_OPCODE_ADD; > this->setDst(&insn, GenRegister::ip()); > this->setSrc0(&insn, GenRegister::ip()); > - this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8)); > + this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8)); > } else { > insn.header.predicate_inverse ^= 1; > this->setSrc1(&insn, GenRegister::immd(2)); > @@ -1099,7 +1116,7 @@ namespace gbe > insn2.header.opcode = GEN_OPCODE_ADD; > this->setDst(&insn2, GenRegister::ip()); > this->setSrc0(&insn2, GenRegister::ip()); > - this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8)); > + this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8)); > } > } > > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index d86e04c..147c3e6 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -76,8 +76,6 @@ > * > * Also, there is some extra kludge to handle the predicates for JMPI. > * > - * See TODO for a better idea for branching and masking > - * > * TODO: > * ===== > * > @@ -92,14 +90,9 @@ > * interesting approach which consists in traversing the dominator tree in > post > * order > * > - * About masking and branching, a much better idea (that I found later > unfortunately) > - * is to replace the use of the flag by uses of if/endif to enclose the basic > - * block. So, instead of using predication, we use auto-masking. The very > cool > - * consequence is that we can reintegrate back the structured branches. > - * Basically, we will be able to identify branches that can be mapped to > - * structured branches and mix nicely unstructured branches (which will use > - * jpmi, if/endif to mask the blocks) and structured branches (which are > pretty > - * fast) > + * We already use if/endif to enclose each basic block. We will continue to > identify > + * those blocks which could match to structured branching and use pure > structured > + * instruction to handle them completely. > */ > > #include "backend/gen_insn_selection.hpp" > @@ -320,38 +313,6 @@ namespace gbe > INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t > registerPool); > /*! indicate whether a register is a scalar/uniform register. 
*/ > INLINE bool isScalarReg(const ir::Register ®) const { > -#if 0 > - printf("reg %d ", reg.value()); > - printf("uniform: %d ", getRegisterData(reg).isUniform()); > - if (ctx.getFunction().getArg(reg) != NULL) { printf("true function > arg\n"); > return true; } > - if (ctx.getFunction().getPushLocation(reg) != NULL) { printf("true push > location.\n"); return true; } > - if (reg == ir::ocl::groupid0 || > - reg == ir::ocl::groupid1 || > - reg == ir::ocl::groupid2 || > - reg == ir::ocl::barrierid || > - reg == ir::ocl::threadn || > - reg == ir::ocl::numgroup0 || > - reg == ir::ocl::numgroup1 || > - reg == ir::ocl::numgroup2 || > - reg == ir::ocl::lsize0 || > - reg == ir::ocl::lsize1 || > - reg == ir::ocl::lsize2 || > - reg == ir::ocl::gsize0 || > - reg == ir::ocl::gsize1 || > - reg == ir::ocl::gsize2 || > - reg == ir::ocl::goffset0 || > - reg == ir::ocl::goffset1 || > - reg == ir::ocl::goffset2 || > - reg == ir::ocl::workdim || > - reg == ir::ocl::emask || > - reg == ir::ocl::notemask || > - reg == ir::ocl::barriermask > - ) { > - printf("special reg.\n"); > - return true; > - } > - return false; > -#endif > const ir::RegisterData ®Data = getRegisterData(reg); > return regData.isUniform(); > } > @@ -992,7 +953,7 @@ namespace gbe > } > > void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1); > + SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1); > insn->src(0) = src; > insn->index = uint16_t(jip); > } > @@ -1412,9 +1373,17 @@ namespace gbe > for (uint32_t regID = 0; regID < this->regNum; ++regID) > this->regDAG[regID] = NULL; > > + this->block->hasBarrier = false; > + this->block->hasBranch = bb.getLastInstruction()->getOpcode() == > OP_BRA || > + bb.getLastInstruction()->getOpcode() == > OP_RET; > + if (!this->block->hasBranch) > + this->block->endifOffset = -1; > + > // Build the DAG on the fly > uint32_t insnNum = 0; > const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) { > + if (insn.getOpcode() == OP_SYNC) > + this->block->hasBarrier = true; > > // Build a selectionDAG node for instruction > SelectionDAG *dag = this->newSelectionDAG(insn); > @@ -1465,6 +1434,7 @@ namespace gbe > void Selection::Opaque::matchBasicBlock(uint32_t insnNum) > { > // Bottom up code generation > + bool needEndif = this->block->hasBranch == false > && !this->block->hasBarrier; > for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) { > // Process all possible patterns for this instruction > SelectionDAG &dag = *insnDAG[insnID]; > @@ -1476,8 +1446,10 @@ namespace gbe > > // Start a new code fragment > this->startBackwardGeneration(); > + // If there is no branch at the end of this block. 
> > // Try all the patterns from best to worst > + > do { > if ((*it)->emit(*this, dag)) > break; > @@ -1485,6 +1457,13 @@ namespace gbe > } while (it != end); > GBE_ASSERT(it != end); > > + if (needEndif) { > + const ir::BasicBlock *curr = insn.getParent(); > + const ir::BasicBlock *next = curr->getNextBlock(); > + this->ENDIF(GenRegister::immd(0), next->getLabelIndex()); > + needEndif = false; > + } > + > // Output the code in the current basic block > this->endBackwardGeneration(); > } > @@ -2133,6 +2112,7 @@ namespace gbe > const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type); > > sel.push(); > + sel.curr.noMask = 1; > sel.curr.predicate = GEN_PREDICATE_NONE; > sel.curr.execWidth = simdWidth; > sel.SEL_CMP(genCmp, tmp, src0, src1); > @@ -2329,7 +2309,6 @@ namespace gbe > const Type type = insn.getType(); > const Immediate imm = insn.getImmediate(); > const GenRegister dst = sel.selReg(insn.getDst(0), type); > - GenRegister flagReg; > > sel.push(); > if (sel.isScalarOrBool(insn.getDst(0)) == true) { > @@ -2371,24 +2350,10 @@ namespace gbe > { > using namespace ir; > const ir::Register reg = sel.reg(FAMILY_DWORD); > - const GenRegister barrierMask = sel.selReg(ocl::barriermask, > TYPE_BOOL); > const uint32_t params = insn.getParameters(); > > - sel.push(); > - sel.curr.predicate = GEN_PREDICATE_NONE; > - sel.curr.noMask = 1; > - sel.curr.execWidth = 1; > - sel.OR(barrierMask, GenRegister::flag(0, 0), barrierMask); > - sel.MOV(GenRegister::flag(1, 1), barrierMask); > - sel.pop(); > - > // A barrier is OK to start the thread synchronization *and* SLM fence > - sel.push(); > - //sel.curr.predicate = GEN_PREDICATE_NONE; > - sel.curr.flag = 1; > - sel.curr.subFlag = 1; > - sel.BARRIER(GenRegister::ud8grf(reg), > sel.selReg(sel.reg(FAMILY_DWORD)), params); > - sel.pop(); > + sel.BARRIER(GenRegister::ud8grf(reg), > sel.selReg(sel.reg(FAMILY_DWORD)), params); > return true; > } > > @@ -2696,7 +2661,7 @@ namespace gbe > GenRegister tmpDst; > > if (type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16) > - tmpDst = sel.selReg(sel.reg(FAMILY_WORD), TYPE_BOOL); > + tmpDst = sel.selReg(dst, TYPE_BOOL); > else > tmpDst = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_S32); > > @@ -2724,36 +2689,23 @@ namespace gbe > sel.push(); > sel.curr.flag = 1; > sel.curr.subFlag = 1; > - sel.curr.predicate = GEN_PREDICATE_NONE; > if (type == TYPE_S64 || type == TYPE_U64) { > GenRegister tmp[3]; > for(int i=0; i<3; i++) > tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD)); > - sel.push(); > - sel.curr.execWidth = 1; > - sel.curr.noMask = 1; > - sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0)); > - sel.pop(); > - sel.curr.predicate = GEN_PREDICATE_NORMAL; > sel.I64CMP(getGenCompare(opcode), src0, src1, tmp, tmpDst); > } else if(opcode == OP_ORD) { > sel.push(); > - sel.curr.execWidth = 1; > - sel.curr.noMask = 1; > - sel.MOV(GenRegister::flag(1, 1), GenRegister::flag(0, 0)); > + sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst); > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst); > sel.pop(); > - sel.curr.predicate = GEN_PREDICATE_NORMAL; > - > - sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst); > - sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst); > } else > sel.CMP(getGenCompare(opcode), src0, src1, tmpDst); > sel.pop(); > > if (!(type == TYPE_BOOL || type == TYPE_U16 || type == TYPE_S16)) > sel.MOV(sel.selReg(dst, TYPE_U16), > GenRegister::unpacked_uw((ir::Register)tmpDst.value.reg)); > - else > - sel.MOV(sel.selReg(dst, TYPE_U16), tmpDst); > return true; > } > }; 
> @@ -2979,11 +2931,6 @@ namespace gbe > markAllChildren(dag); > } > > - // Since we cannot predicate the select instruction with our current > mask, > - // we need to perform the selection in two steps (one to select, one to > - // update the destination register) > - const RegisterFamily family = getFamily(type); > - const GenRegister tmp = sel.selReg(sel.reg(family), type); > const uint32_t simdWidth = sel.ctx.getSimdWidth(); > const Register pred = insn.getPredicate(); > sel.push(); > @@ -2992,16 +2939,14 @@ namespace gbe > sel.curr.flag = 1; > sel.curr.subFlag = 1; > sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), > GenRegister::immuw(0)); > - sel.curr.noMask = 0; > + //sel.curr.noMask = 0; > sel.curr.predicate = GEN_PREDICATE_NORMAL; > if(type == ir::TYPE_S64 || type == ir::TYPE_U64) > - sel.SEL_INT64(tmp, src0, src1); > + sel.SEL_INT64(dst, src0, src1); > else > - sel.SEL(tmp, src0, src1); > + sel.SEL(dst, src0, src1); > sel.pop(); > > - // Update the destination register properly now > - sel.MOV(dst, tmp); > return true; > } > }; > @@ -3041,6 +2986,7 @@ namespace gbe > DECL_CTOR(TernaryInstruction, 1, 1); > }; > > + > /*! Label instruction pattern */ > DECL_PATTERN(LabelInstruction) > { > @@ -3053,42 +2999,75 @@ namespace gbe > const uint32_t simdWidth = sel.ctx.getSimdWidth(); > sel.LABEL(label); > > - // Do not emit any code for the "returning" block. There is no need for > it > - if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock()) > + // Do not emit any code for the "returning" block. There is no need for > it > + if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock()) > return true; > > + LabelIndex jip; > + const LabelIndex nextLabel = > insn.getParent()->getNextBlock()->getLabelIndex(); > + if (sel.ctx.hasJIP(&insn)) > + jip = sel.ctx.getLabelIndex(&insn); > + else > + jip = nextLabel; > + > // Emit the mask computation at the head of each basic block > sel.push(); > + sel.curr.noMask = 1; > sel.curr.predicate = GEN_PREDICATE_NONE; > - sel.curr.flag = 0; > - sel.curr.subFlag = 0; > sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, > GEN_TYPE_UW), src1); > sel.pop(); > > - // If it is required, insert a JUMP to bypass the block > - if (sel.ctx.hasJIP(&insn)) { > - const LabelIndex jip = sel.ctx.getLabelIndex(&insn); > + if (sel.block->hasBarrier) { > + // If this block has barrier, we don't execute the block until all > lanes > + // are 1s. Set each reached lane to 1, then check all lanes. If > there is > any > + // lane not reached, we jump to jip. And no need to issue if/endif > for > + // this block, as it will always excute with all lanes activated. 
> sel.push(); > - > - sel.curr.noMask = 1; > - sel.curr.execWidth = 1; > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), > GenRegister::immuw(GEN_MAX_LABEL)); > sel.curr.predicate = GEN_PREDICATE_NONE; > - GenRegister emaskReg = GenRegister::uw1grf(ocl::emask); > - GenRegister flagReg = GenRegister::flag(0, 0); > - sel.AND(flagReg, flagReg, emaskReg); > - > + sel.curr.noMask = 1; > + sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, > GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL)); > if (simdWidth == 8) > - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; > + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; > else if (simdWidth == 16) > - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; > + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; > else > NOT_IMPLEMENTED; > + sel.curr.noMask = 1; > + sel.curr.execWidth = 1; > sel.curr.inversePredicate = 1; > - sel.curr.flag = 0; > - sel.curr.subFlag = 0; > sel.JMPI(GenRegister::immd(0), jip); > sel.pop(); > + // FIXME, if the last BRA is unconditional jump, we don't need to > update the label here. > + sel.push(); > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), > GenRegister::immuw((uint16_t)label)); > + sel.pop(); > + } > + else { > + if (sel.ctx.hasJIP(&insn)) { > + // If it is required, insert a JUMP to bypass the block > + sel.push(); > + if (simdWidth == 8) > + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; > + else if (simdWidth == 16) > + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; > + else > + NOT_IMPLEMENTED; > + sel.curr.noMask = 1; > + sel.curr.execWidth = 1; > + sel.curr.inversePredicate = 1; > + sel.JMPI(GenRegister::immd(0), jip); > + sel.pop(); > + } > + sel.push(); > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + // It's easier to set the jip to a relative position over next > block. > + sel.IF(GenRegister::immd(0), nextLabel, nextLabel, > sel.block->endifOffset, sel.block->endifOffset); > + sel.pop(); > } > + > return true; > } > DECL_CTOR(LabelInstruction, 1, 1); > @@ -3225,7 +3204,6 @@ namespace gbe > /*! Branch instruction pattern */ > DECL_PATTERN(BranchInstruction) > { > - > void emitForwardBranch(Selection::Opaque &sel, > const ir::BranchInstruction &insn, > ir::LabelIndex dst, > @@ -3233,16 +3211,13 @@ namespace gbe > { > using namespace ir; > const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16); > - const LabelIndex jip = sel.ctx.getLabelIndex(&insn); > > // We will not emit any jump if we must go the next block anyway > const BasicBlock *curr = insn.getParent(); > const BasicBlock *next = curr->getNextBlock(); > const LabelIndex nextLabel = next->getLabelIndex(); > - > if (insn.isPredicated() == true) { > const Register pred = insn.getPredicateIndex(); > - > sel.push(); > // we don't need to set next label to the pcip > // as if there is no backward jump latter, then obviously > everything will work fine. 
> @@ -3250,22 +3225,30 @@ namespace gbe > sel.curr.flag = 0; > sel.curr.subFlag = 0; > sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), > GenRegister::immuw(0)); > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); > + if (!sel.block->hasBarrier) > + sel.ENDIF(GenRegister::immd(0), nextLabel); > + sel.block->endifOffset = -1; > sel.pop(); > - > - if (nextLabel == jip) return; > } else { > // Update the PcIPs > + const LabelIndex jip = sel.ctx.getLabelIndex(&insn); > sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); > - > - // Do not emit branch when we go to the next block anyway > + if (!sel.block->hasBarrier) > + sel.ENDIF(GenRegister::immd(0), nextLabel); > + sel.block->endifOffset = -1; > if (nextLabel == jip) return; > + // Branch to the jump target > sel.push(); > sel.curr.execWidth = 1; > sel.curr.noMask = 1; > sel.curr.predicate = GEN_PREDICATE_NONE; > sel.JMPI(GenRegister::immd(0), jip); > sel.pop(); > + // FIXME just for the correct endif offset. > + // JMPI still has 2 instruction. > + sel.block->endifOffset -= 2; > } > } > > @@ -3290,37 +3273,32 @@ namespace gbe > // that actually take the branch > const LabelIndex next = bb.getNextBlock()->getLabelIndex(); > sel.MOV(ip, GenRegister::immuw(uint16_t(next))); > - > + GBE_ASSERT(jip == dst); > sel.push(); > sel.curr.flag = 0; > sel.curr.subFlag = 0; > + sel.curr.predicate = GEN_PREDICATE_NONE; > sel.CMP(GEN_CONDITIONAL_NEQ, sel.selReg(pred, TYPE_U16), > GenRegister::immuw(0)); > - // Re-update the PcIPs for the branches that takes the backward > jump > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); > - > - // We clear all the inactive channel to 0 as the > GEN_PREDICATE_ALIGN1_ANY8/16 > - // will check those bits as well. > sel.curr.predicate = GEN_PREDICATE_NONE; > + if (!sel.block->hasBarrier) > + sel.ENDIF(GenRegister::immd(0), next); > sel.curr.execWidth = 1; > - sel.curr.noMask = 1; > - GenRegister emaskReg = GenRegister::uw1grf(ocl::emask); > - sel.AND(GenRegister::flag(0, 1), GenRegister::flag(0, 1), > emaskReg); > - > - // Branch to the jump target > - if (simdWidth == 8) > - sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; > - else if (simdWidth == 16) > + if (simdWidth == 16) > sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H; > else > - NOT_SUPPORTED; > + sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; > + sel.curr.noMask = 1; > sel.JMPI(GenRegister::immd(0), jip); > + sel.block->endifOffset = -3; > sel.pop(); > - > } else { > - > + const LabelIndex next = bb.getNextBlock()->getLabelIndex(); > // Update the PcIPs > sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); > - > + if (!sel.block->hasBarrier) > + sel.ENDIF(GenRegister::immd(0), next); > // Branch to the jump target > sel.push(); > sel.curr.execWidth = 1; > @@ -3328,6 +3306,7 @@ namespace gbe > sel.curr.predicate = GEN_PREDICATE_NONE; > sel.JMPI(GenRegister::immd(0), jip); > sel.pop(); > + sel.block->endifOffset = -3; > } > } > > diff --git a/backend/src/backend/gen_insn_selection.hpp > b/backend/src/backend/gen_insn_selection.hpp > index 04fbb9f..8557768 100644 > --- a/backend/src/backend/gen_insn_selection.hpp > +++ b/backend/src/backend/gen_insn_selection.hpp > @@ -42,6 +42,8 @@ namespace gbe > /*! Translate IR compare to Gen compare */ > uint32_t getGenCompare(ir::Opcode opcode); > > + #define GEN_MAX_LABEL 0xFFFF > + > /*! 
Selection opcodes properly encoded from 0 to n for fast jump tables > * generations > */ > @@ -190,6 +192,9 @@ namespace gbe > void append(SelectionInstruction *insn); > /*! Append a new selection instruction at the beginning of the block */ > void prepend(SelectionInstruction *insn); > + int endifOffset; > + bool hasBarrier; > + bool hasBranch; > }; > > /*! Owns the selection engine */ > diff --git a/backend/src/backend/gen_insn_selection.hxx > b/backend/src/backend/gen_insn_selection.hxx > index d318f8e..ddc9d5e 100644 > --- a/backend/src/backend/gen_insn_selection.hxx > +++ b/backend/src/backend/gen_insn_selection.hxx > @@ -80,7 +80,7 @@ DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction) > DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction) > DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction) > DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction) > -DECL_SELECTION_IR(BRC, BinaryInstruction) > +DECL_SELECTION_IR(BRC, UnaryInstruction) > DECL_SELECTION_IR(BRD, UnaryInstruction) > DECL_SELECTION_IR(IF, UnaryInstruction) > DECL_SELECTION_IR(ENDIF, UnaryInstruction) > diff --git a/backend/src/backend/gen_register.hpp > b/backend/src/backend/gen_register.hpp > index 25cb428..051f16d 100644 > --- a/backend/src/backend/gen_register.hpp > +++ b/backend/src/backend/gen_register.hpp > @@ -118,7 +118,7 @@ namespace gbe > this->noMask = 0; > this->flag = 0; > this->subFlag = 0; > - this->predicate = GEN_PREDICATE_NORMAL; > + this->predicate = GEN_PREDICATE_NONE; > this->inversePredicate = 0; > this->physicalFlag = 1; > this->flagIndex = 0; > @@ -657,6 +657,17 @@ namespace gbe > GEN_HORIZONTAL_STRIDE_1); > } > > + static INLINE GenRegister nullud(void) { > + return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, > + GEN_ARF_NULL, > + 0, > + GEN_TYPE_UD, > + GEN_VERTICAL_STRIDE_8, > + GEN_WIDTH_8, > + GEN_HORIZONTAL_STRIDE_1); > + } > + > + > static INLINE bool isNull(GenRegister reg) { > return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE > && reg.nr == GEN_ARF_NULL); > diff --git a/src/cl_api.c b/src/cl_api.c > index 9638994..b572500 100644 > --- a/src/cl_api.c > +++ b/src/cl_api.c > @@ -2526,6 +2526,7 @@ clEnqueueNDRangeKernel(cl_command_queue > command_queue, > > err = cl_command_queue_flush(command_queue); > } > + clFinish(command_queue); > > error: > return err; > diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c > index 5e474de..c2308da 100644 > --- a/src/intel/intel_driver.c > +++ b/src/intel/intel_driver.c > @@ -135,6 +135,7 @@ intel_driver_memman_init(intel_driver_t *driver) > { > driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE); > assert(driver->bufmgr); > + //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1); > drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr); > } > > diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c > index e95b050..4819e9e 100644 > --- a/src/intel/intel_gpgpu.c > +++ b/src/intel/intel_gpgpu.c > @@ -695,7 +695,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, > cl_gpgpu_kernel *kernel) > memset(desc, 0, sizeof(*desc)); > ker_bo = (drm_intel_bo *) kernel->bo; > desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */ > - desc->desc1.single_program_flow = 1; > + desc->desc1.single_program_flow = 0; > desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */ > desc->desc5.rounding_mode = 0; /* round to nearest even */ > > diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp > index 35d4c4f..f901fdf 100644 > --- a/utests/compiler_long_cmp.cpp > +++ b/utests/compiler_long_cmp.cpp > @@ -45,6 
+45,7 @@ void compiler_long_cmp(void) > int64_t *dest = (int64_t *)buf_data[2]; > int64_t x = (src1[i] < src2[i]) ? 3 : 4; > OCL_ASSERT(x == dest[i]); > + //printf("%d %ld %ld \n", i, dest[i], x); > } > OCL_UNMAP_BUFFER(2); > OCL_DESTROY_KERNEL_KEEP_PROGRAM(true); > diff --git a/utests/compiler_unstructured_branch0.cpp > b/utests/compiler_unstructured_branch0.cpp > index 128a53e..1a371e9 100644 > --- a/utests/compiler_unstructured_branch0.cpp > +++ b/utests/compiler_unstructured_branch0.cpp > @@ -27,7 +27,6 @@ static void compiler_unstructured_branch0(void) > OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2); > for (uint32_t i = 16; i < 32; ++i) > OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1); > - > // Second control flow > for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2; > OCL_UNMAP_BUFFER(0); > @@ -36,8 +35,7 @@ static void compiler_unstructured_branch0(void) > OCL_MAP_BUFFER(0); > OCL_MAP_BUFFER(1); > for (uint32_t i = 0; i < 32; ++i) > - OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1); > - > + OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1); > // Third control flow > for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2; > OCL_UNMAP_BUFFER(0); > diff --git a/utests/compiler_unstructured_branch1.cpp > b/utests/compiler_unstructured_branch1.cpp > index 6021f5b..fb24cec 100644 > --- a/utests/compiler_unstructured_branch1.cpp > +++ b/utests/compiler_unstructured_branch1.cpp > @@ -25,7 +25,6 @@ static void compiler_unstructured_branch1(void) > OCL_MAP_BUFFER(1); > for (uint32_t i = 0; i < n; ++i) > OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2); > - > // Second control flow > for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2; > OCL_UNMAP_BUFFER(0); > @@ -34,7 +33,7 @@ static void compiler_unstructured_branch1(void) > OCL_MAP_BUFFER(0); > OCL_MAP_BUFFER(1); > for (uint32_t i = 0; i < n; ++i) > - OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3); > + OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3); > > // Third control flow > for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2; > diff --git a/utests/compiler_unstructured_branch2.cpp > b/utests/compiler_unstructured_branch2.cpp > index d61c6b5..68c7448 100644 > --- a/utests/compiler_unstructured_branch2.cpp > +++ b/utests/compiler_unstructured_branch2.cpp > @@ -23,6 +23,7 @@ static void compiler_unstructured_branch2(void) > // First control flow > OCL_MAP_BUFFER(0); > OCL_MAP_BUFFER(1); > +#if 1 > for (uint32_t i = 0; i < n; ++i) > OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12); > > @@ -35,7 +36,7 @@ static void compiler_unstructured_branch2(void) > OCL_MAP_BUFFER(1); > for (uint32_t i = 0; i < n; ++i) > OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6); > - > +#endif > // Third control flow > for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2; > for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2; > @@ -45,9 +46,16 @@ static void compiler_unstructured_branch2(void) > OCL_MAP_BUFFER(0); > OCL_MAP_BUFFER(1); > for (uint32_t i = 0; i < 8; ++i) > + { > + //printf("%d: %d %d\n", i, ((int32_t*)buf_data[1])[i], 12); > OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12); > + } > for (uint32_t i = 8; i < n; ++i) > + { > + //printf("%d: %d %d\n", i, ((int32_t*)buf_data[1])[i], -6); > OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6); > + } > + //exit(0); > > // Fourth control flow > for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 1; > -- > 1.8.3.2 > > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > 
http://lists.freedesktop.org/mailman/listinfo/beignet