Thanks for the comment. The intention of extracting flag calculation is to for another patch which is to reuse those flag register in more than one place and avoid calculate them again. The new patch is to save one DW loading when the address is aligned or is a short vector which is not crossing a DW boundary.
But the benchmark shows the new patch doesn't bring any performance gain :(. So before I can work out that new patch, I agree it's better to put these flag calculation in the getEffectByte(). On Wed, Sep 03, 2014 at 02:21:35AM +0000, Song, Ruiling wrote: > This patchset LGTM. But a typo in subject "unalgined" should be "unaligned". > And I think it is better to keep the piece of code still in > getEffectByteData() now. > > -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Zhigang Gong > Sent: Thursday, August 28, 2014 10:46 AM > To: [email protected] > Cc: Gong, Zhigang > Subject: [Beignet] [PATCH 2/2] GBE: refine the unalgined data gathering. > > Save some unecessary duplicate instructions. > > Signed-off-by: Zhigang Gong <[email protected]> > --- > backend/src/backend/gen_insn_selection.cpp | 43 > +++++++++++++++++------------- > 1 file changed, 25 insertions(+), 18 deletions(-) > > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 1258e54..4be6372 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -2888,7 +2888,9 @@ namespace gbe > vector<GenRegister> &effectData, > vector<GenRegister> &tmp, > uint32_t effectDataNum, > - GenRegister addr, > + GenRegister shiftL, > + GenRegister shiftH, > + ir::Register alignedFlag, > uint32_t simdWidth) const > { > using namespace ir; > @@ -2899,25 +2901,14 @@ namespace gbe > for(uint32_t i = 0; i < effectDataNum; i++) { > GenRegister tmpH = GenRegister::udxgrf(simdWidth, > sel.reg(FAMILY_DWORD)); > GenRegister tmpL = effectData[i]; > - GenRegister shift = GenRegister::udxgrf(simdWidth, > sel.reg(FAMILY_DWORD)); > - Register shift1Reg = sel.reg(FAMILY_DWORD); > - GenRegister shift1 = GenRegister::udxgrf(simdWidth, shift1Reg); > - GenRegister factor = GenRegister::udxgrf(simdWidth, > sel.reg(FAMILY_DWORD)); > - sel.AND(shift, GenRegister::retype(addr, GEN_TYPE_UD), > GenRegister::immud(0x3)); > - sel.SHL(shift, shift, GenRegister::immud(0x3)); > - sel.SHR(tmpL, tmp[i], shift); > - sel.ADD(shift1, GenRegister::negate(shift), > GenRegister::immud(32)); > + sel.SHR(tmpL, tmp[i], shiftL); > sel.push(); > - // Only need to consider the tmpH when the shift is not 32. > - Register flag = sel.reg(FAMILY_BOOL); > - sel.curr.physicalFlag = 0; > - sel.curr.modFlag = 1; > - sel.curr.predicate = GEN_PREDICATE_NONE; > - sel.curr.flagIndex = (uint16_t)flag; > - sel.CMP(GEN_CONDITIONAL_NEQ, > GenRegister::unpacked_uw(shift1Reg), GenRegister::immuw(32), factor); > + // Only need to consider the tmpH when the addr is not aligned. > sel.curr.modFlag = 0; > + sel.curr.physicalFlag = 0; > + sel.curr.flagIndex = (uint16_t)alignedFlag; > sel.curr.predicate = GEN_PREDICATE_NORMAL; > - sel.SHL(tmpH, tmp[i + 1], shift1); > + sel.SHL(tmpH, tmp[i + 1], shiftH); > sel.OR(effectData[i], tmpL, tmpH); > sel.pop(); > } > @@ -2978,7 +2969,23 @@ namespace gbe > for(uint32_t i = 0; i < effectDataNum; i++) > effectData[i] = GenRegister::udxgrf(simdWidth, > sel.reg(FAMILY_DWORD)); > > - getEffectByteData(sel, effectData, tmp, effectDataNum, address, > simdWidth); > + Register alignedFlag = sel.reg(FAMILY_BOOL); > + GenRegister shiftL = GenRegister::udxgrf(simdWidth, > sel.reg(FAMILY_DWORD)); > + Register shiftHReg = sel.reg(FAMILY_DWORD); > + GenRegister shiftH = GenRegister::udxgrf(simdWidth, shiftHReg); > + sel.push(); > + if (simdWidth == 1) > + sel.curr.noMask = 1; > + sel.AND(shiftL, GenRegister::retype(address, GEN_TYPE_UD), > GenRegister::immud(0x3)); > + sel.SHL(shiftL, shiftL, GenRegister::immud(0x3)); > + sel.ADD(shiftH, GenRegister::negate(shiftL), > GenRegister::immud(32)); > + sel.curr.physicalFlag = 0; > + sel.curr.modFlag = 1; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.flagIndex = (uint16_t)alignedFlag; > + sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shiftHReg), > GenRegister::immuw(32)); > + sel.pop(); > + getEffectByteData(sel, effectData, tmp, effectDataNum, shiftL, > shiftH, alignedFlag, simdWidth); > > for(uint32_t i = 0; i < effectDataNum; i++) { > unsigned int elemNum = (valueNum - i * (4 / typeSize)) > > 4/typeSize ? > -- > 1.8.3.2 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
