Should the unaligned optimization we did in vload/vstore also go away after HSW?
Thanks Zou Nanhai > -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Yang Rong > Sent: Monday, June 15, 2015 2:46 PM > To: [email protected] > Cc: Yang, Rong R > Subject: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when read > byte/shor. > > After HSW, the byte gather's performance issue has gone, so needn't read > dword and extract. > But for multi dst load, the combine reduce the address calc, but need the > extract the dst, maybe performance is approximate, so still use the old logic. > > Signed-off-by: Yang Rong <[email protected]> > --- > backend/src/backend/gen_insn_selection.cpp | 36 > ++++++++++++++++++++++++++++-- > 1 file changed, 34 insertions(+), 2 deletions(-) > > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index d63c7e3..d289e8e 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -365,6 +365,8 @@ namespace gbe > void setLongRegRestrict(bool b) { bLongRegRestrict = b; } > void setLdMsgOrder(uint32_t type) { ldMsgOrder = type; } > uint32_t getLdMsgOrder() const { return ldMsgOrder; } > + void setSlowByteGather(bool b) { slowByteGather = b; } > + bool getSlowByteGather() { return slowByteGather; } > /*! indicate whether a register is a scalar/uniform register. 
*/ > INLINE bool isPartialWrite(const ir::Register &reg) const { > return partialWriteRegs.find(reg.value()) != partialWriteRegs.end(); > @@ -740,6 +742,7 @@ namespace gbe > bool bHasLongType; > bool bLongRegRestrict; > uint32_t ldMsgOrder; > + bool slowByteGather; > INLINE ir::LabelIndex newAuxLabel() > { > currAuxLabel++; > @@ -779,7 +782,8 @@ namespace gbe > curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()), > maxInsnNum(ctx.getFunction().getLargestBlockSize()), > dagPool(maxInsnNum), > stateNum(0), vectorNum(0), bwdCodeGeneration(false), > currAuxLabel(ctx.getFunction().labelNum()), > - bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), > ldMsgOrder(LD_MSG_ORDER_IVB) > + bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), > ldMsgOrder(LD_MSG_ORDER_IVB), > + slowByteGather(false) > { > const ir::Function &fn = ctx.getFunction(); > this->regNum = fn.regNum(); > @@ -2025,26 +2029,31 @@ namespace gbe > Selection::Selection(GenContext &ctx) { > this->blockList = NULL; > this->opaque = GBE_NEW(Selection::Opaque, ctx); > + this->opaque->setSlowByteGather(true); > } > > Selection75::Selection75(GenContext &ctx) : Selection(ctx) { > + this->opaque->setSlowByteGather(false); > } > > Selection8::Selection8(GenContext &ctx) : Selection(ctx) { > this->opaque->setHas32X32Mul(true); > this->opaque->setHasLongType(true); > + this->opaque->setSlowByteGather(false); > } > > SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) { > this->opaque->setHas32X32Mul(true); > this->opaque->setHasLongType(true); > this->opaque->setLongRegRestrict(true); > + this->opaque->setSlowByteGather(false); > } > > Selection9::Selection9(GenContext &ctx) : Selection(ctx) { > this->opaque->setHas32X32Mul(true); > this->opaque->setHasLongType(true); > this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); > + this->opaque->setSlowByteGather(false); > } > > void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t > msgNum, @@ -3519,8 +3528,31 @@ 
namespace gbe > GBE_ASSERT(insn.getValueNum() == 1); > const GenRegister value = sel.selReg(insn.getValue(0), > insn.getValueType()); > GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize > == GEN_BYTE_SCATTER_BYTE); > + if(sel.getSlowByteGather()) > + readByteAsDWord(sel, elemSize, address, value, isUniform, bti); > + else { > + GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : > sel.selReg(bti.reg, ir::TYPE_U32); > + GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, > + true), ir::TYPE_U16); > + > + // We need a temporary register if we read bytes or words > + Register dst = sel.reg(FAMILY_DWORD, isUniform); > + sel.push(); > + if (isUniform) > + sel.curr.noMask = 1; > + sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, > elemSize, b, bti.isConst ? NULL : & tmpFlag); > + sel.pop(); > > - readByteAsDWord(sel, elemSize, address, value, isUniform, bti); > + sel.push(); > + if (isUniform) { > + sel.curr.noMask = 1; > + sel.curr.execWidth = 1; > + } > + if (elemSize == GEN_BYTE_SCATTER_WORD) > + sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), > GenRegister::unpacked_uw(dst)); > + else if (elemSize == GEN_BYTE_SCATTER_BYTE) > + sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), > GenRegister::unpacked_ub(dst)); > + sel.pop(); > + } > } > } > > -- > 1.8.3.2 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
