Currently, the unaligned byte gather is implemented with readByteAsDWord for good performance. Change back to the native byte scattered read on newer GPU generations, where it performs better.
as for vload16(1/2/.../15, global char*), the native byte scattered read is not good, so use the original method for vload16. Signed-off-by: Guo Yejun <yejun....@intel.com> --- backend/src/backend/gen_insn_selection.cpp | 120 +++++++++++++++++++---------- backend/src/backend/gen_register.hpp | 9 ++- 2 files changed, 86 insertions(+), 43 deletions(-) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 001a3c5..41acd91 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -2283,7 +2283,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp this->opaque->setHasLongType(true); this->opaque->setHasDoubleType(true); this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); - this->opaque->setSlowByteGather(true); + this->opaque->setSlowByteGather(false); this->opaque->setHasHalfType(true); opt_features = SIOF_LOGICAL_SRCMOD; } @@ -2294,7 +2294,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp this->opaque->setLongRegRestrict(true); this->opaque->setHasDoubleType(true); this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); - this->opaque->setSlowByteGather(true); + this->opaque->setSlowByteGather(false); this->opaque->setHasHalfType(true); opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT; } @@ -3784,53 +3784,91 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp if(valueNum > 1) { GBE_ASSERT(!isUniform && "vector load should not be uniform. 
Something went wrong."); - vector<GenRegister> dst(valueNum); - const uint32_t typeSize = getFamilySize(family); + //need to investigate the case of GEN_BYTE_SCATTER_WORD later + if(sel.getSlowByteGather() || elemSize == GEN_BYTE_SCATTER_WORD + || (elemSize == GEN_BYTE_SCATTER_BYTE && valueNum == 16)) { + vector<GenRegister> dst(valueNum); + const uint32_t typeSize = getFamilySize(family); - for(uint32_t i = 0; i < valueNum; i++) - dst[i] = sel.selReg(insn.getValue(i), getType(family)); + for(uint32_t i = 0; i < valueNum; i++) + dst[i] = sel.selReg(insn.getValue(i), getType(family)); - uint32_t effectDataNum = (typeSize*valueNum + 3) / 4; - vector<GenRegister> tmp(effectDataNum + 1); - vector<GenRegister> tmp2(effectDataNum + 1); - vector<GenRegister> effectData(effectDataNum); - for(uint32_t i = 0; i < effectDataNum + 1; i++) - tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); + uint32_t effectDataNum = (typeSize*valueNum + 3) / 4; + vector<GenRegister> tmp(effectDataNum + 1); + vector<GenRegister> tmp2(effectDataNum + 1); + vector<GenRegister> effectData(effectDataNum); + for(uint32_t i = 0; i < effectDataNum + 1; i++) + tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); - GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); - sel.push(); - if (isUniform) - sel.curr.noMask = 1; - sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3)); - sel.pop(); + GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); + sel.push(); + if (isUniform) + sel.curr.noMask = 1; + sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3)); + sel.pop(); - uint32_t remainedReg = effectDataNum + 1; - uint32_t pos = 0; - do { - uint32_t width = remainedReg > 4 ? 
4 : remainedReg; - vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width); - vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width); - if (pos != 0) { - sel.push(); - if (isUniform) - sel.curr.noMask = 1; - sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4)); - sel.pop(); + uint32_t remainedReg = effectDataNum + 1; + uint32_t pos = 0; + do { + uint32_t width = remainedReg > 4 ? 4 : remainedReg; + vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width); + vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width); + if (pos != 0) { + sel.push(); + if (isUniform) + sel.curr.noMask = 1; + sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4)); + sel.pop(); + } + readDWord(sel, t1, alignedAddr, width, bti); + remainedReg -= width; + pos += width; + } while(remainedReg); + + for(uint32_t i = 0; i < effectDataNum; i++) + effectData[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); + + getEffectByteData(sel, effectData, tmp, effectDataNum, address, isUniform); + + for(uint32_t i = 0; i < effectDataNum; i++) { + unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ? + 4/typeSize : (valueNum - i * (4 / typeSize)); + sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum); + } + } else { + GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_BYTE); + GenRegister b = bti.isConst ? 
GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32); + vector<GenRegister> dst(valueNum); + for(uint32_t i = 0; i < valueNum; i++) + dst[i] = sel.selReg(insn.getValue(i), getType(family)); + + Register readDst = sel.reg(FAMILY_DWORD); + uint32_t valueIndex = 0; + uint32_t loopCount = (valueNum + 3) / 4; + GenRegister addressForLoop = address; + + sel.push(); + if (loopCount > 1) { + addressForLoop = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); + sel.MOV(addressForLoop, address); } - readDWord(sel, t1, alignedAddr, width, bti); - remainedReg -= width; - pos += width; - } while(remainedReg); - for(uint32_t i = 0; i < effectDataNum; i++) - effectData[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32); + for (uint32_t i = 0; i < loopCount; ++i) { + uint32_t dataSize = 2; // 4 bytes + GBE_ASSERT(valueNum - valueIndex > 1); + if (valueNum - valueIndex == 2) + dataSize = 1; // 2 bytes + sel.BYTE_GATHER(sel.selReg(readDst, ir::TYPE_U32), addressForLoop, dataSize, b, sel.getBTITemps(bti)); - getEffectByteData(sel, effectData, tmp, effectDataNum, address, isUniform); + //mov bytes from read destination to the real destination, 4 bytes at most. + for (uint32_t j = 0; j < 4 && valueIndex < valueNum; ++j) + sel.MOV(GenRegister::retype(dst[valueIndex++], GEN_TYPE_UB), GenRegister::unpacked_ub(readDst, false, j)); - for(uint32_t i = 0; i < effectDataNum; i++) { - unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ? 
- 4/typeSize : (valueNum - i * (4 / typeSize)); - sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum); + //calculate the new address to read + if (valueIndex < valueNum) + sel.ADD(addressForLoop, addressForLoop, GenRegister::immud(4)); + } + sel.pop(); } } else { GBE_ASSERT(insn.getValueNum() == 1); diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index bbea761..74db01e 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -585,13 +585,18 @@ namespace gbe GEN_TYPE_UW, vstride, width, hstride); } - static INLINE GenRegister unpacked_ub(ir::Register reg, bool uniform = false) { - return GenRegister(GEN_GENERAL_REGISTER_FILE, + static INLINE GenRegister unpacked_ub(ir::Register reg, bool uniform = false, uint32_t subnr = 0) { + GenRegister ub = GenRegister(GEN_GENERAL_REGISTER_FILE, reg, GEN_TYPE_UB, uniform ? GEN_VERTICAL_STRIDE_0 : GEN_VERTICAL_STRIDE_32, uniform ? GEN_WIDTH_1 : GEN_WIDTH_8, uniform ? GEN_HORIZONTAL_STRIDE_0 : GEN_HORIZONTAL_STRIDE_4); + if (subnr > 0) { + ub.subnr = subnr; + ub.subphysical = 1; + } + return ub; } static INLINE GenRegister imm(uint32_t type) { -- 1.9.1 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/beignet