The previous restriction was that the vector size must be a multiple of a DWORD. This restriction prevents vload2/3 of char or vload3 of ushort from being optimized. This patch relaxes the restriction on the vload path.
Signed-off-by: Zhigang Gong <[email protected]> --- backend/src/backend/gen_context.cpp | 6 ++-- backend/src/backend/gen_insn_selection.cpp | 39 +++++++++++------------- backend/src/llvm/llvm_gen_backend.cpp | 3 +- backend/src/llvm/llvm_loadstore_optimization.cpp | 3 +- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index ba4a8f8..883fa39 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -1693,7 +1693,7 @@ namespace gbe void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) { const GenRegister src = ra->genReg(insn.src(0)); for(uint32_t i = 0; i < insn.dstNum; i++) { - p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i)); + p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.extra.elem, i)); } } @@ -1702,12 +1702,12 @@ namespace gbe p->push(); if(simdWidth == 8) { for(uint32_t i = 0; i < insn.srcNum; i++) - p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i))); + p->MOV(GenRegister::splitReg(dst, insn.extra.elem, i), ra->genReg(insn.src(i))); } else { // when destination expands two registers, the source must span two registers. p->curr.execWidth = 8; for(uint32_t i = 0; i < insn.srcNum; i++) { - GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i); + GenRegister dsti = GenRegister::splitReg(dst, insn.extra.elem, i); GenRegister src = ra->genReg(insn.src(i)); p->curr.quarterControl = 0; diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 8478616..1258e54 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -575,10 +575,10 @@ namespace gbe void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti); /*! DWord scatter (for constant cache read) */ void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti); - /*! 
Unpack the uint to char4 */ - void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum); - /*! pack the char4 to uint */ - void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum); + /*! Unpack the uint to charN */ + void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum); + /*! pack the charN to uint */ + void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum); /*! Extended math function (2 arguments) */ void MATH(Reg dst, uint32_t function, Reg src0, Reg src1); /*! Extended math function (1 argument) */ @@ -1255,16 +1255,18 @@ namespace gbe srcVector->reg = &insn->src(0); } - void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) { + void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum) { SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1); insn->src(0) = src; + insn->extra.elem = 4 / elemSize; for(uint32_t i = 0; i < elemNum; i++) insn->dst(i) = dst[i]; } - void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) { + void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum) { SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum); for(uint32_t i = 0; i < elemNum; i++) insn->src(i) = src[i]; + insn->extra.elem = 4 / elemSize; insn->dst(0) = dst; } @@ -2862,9 +2864,7 @@ namespace gbe for(uint32_t i = 0; i < valueNum; i++) dst[i] = sel.selReg(insn.getValue(i), getType(family)); - uint32_t tmpRegNum = typeSize*valueNum / 4; - if (tmpRegNum == 0) - tmpRegNum = 1; + uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4; vector<GenRegister> tmp(tmpRegNum); vector<GenRegister> tmp2(tmpRegNum); vector<Register> tmpReg(tmpRegNum); @@ -2875,15 +2875,10 @@ namespace gbe readDWord(sel, tmp, 
tmp2, address, tmpRegNum, insn.getAddressSpace(), bti); - if (valueNum > 1) { - for(uint32_t i = 0; i < tmpRegNum; i++) - sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize); - } - else { - if (elemSize == GEN_BYTE_SCATTER_WORD) - sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UW), sel.unpacked_uw(tmpReg[0])); - else if (elemSize == GEN_BYTE_SCATTER_BYTE) - sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UB), sel.unpacked_ub(tmpReg[0])); + for(uint32_t i = 0; i < tmpRegNum; i++) { + unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ? + 4/typeSize : (valueNum - i * (4 / typeSize)); + sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], typeSize, elemNum); } } @@ -2948,7 +2943,7 @@ namespace gbe for(uint32_t i = 0; i < valueNum; i++) dst[i] = sel.selReg(insn.getValue(i), getType(family)); - uint32_t effectDataNum = typeSize*valueNum / 4; + uint32_t effectDataNum = (typeSize*valueNum + 3) / 4; vector<GenRegister> tmp(effectDataNum + 1); vector<GenRegister> tmp2(effectDataNum + 1); vector<GenRegister> effectData(effectDataNum); @@ -2986,7 +2981,9 @@ namespace gbe getEffectByteData(sel, effectData, tmp, effectDataNum, address, simdWidth); for(uint32_t i = 0; i < effectDataNum; i++) { - sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], 4/typeSize); + unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ? 
+ 4/typeSize : (valueNum - i * (4 / typeSize)); + sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum); } } else { GBE_ASSERT(insn.getValueNum() == 1); @@ -3148,7 +3145,7 @@ namespace gbe vector<GenRegister> tmp(tmpRegNum); for(uint32_t i = 0; i < tmpRegNum; i++) { tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); - sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize); + sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize); } sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti); diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index b956bc6..8f0d5c2 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -3515,7 +3515,8 @@ handle_write_image: emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned); } } - else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) { + else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) || + (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) { emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned); } else { for (uint32_t elemID = 0; elemID < elemNum; elemID++) { diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp index 19726b0..ae91af7 100644 --- a/backend/src/llvm/llvm_loadstore_optimization.cpp +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp @@ -259,8 +259,7 @@ namespace gbe { while(size > 1) { unsigned vecSize = (size >= 16) ? 16 : (size >= 8 ? 8 : - (size >= 4 ? 4 : - (size >= 2 ? 2 : size))); + (size >= 4 ? 
4 : size)); SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize); if(isLoad) mergeLoad(BB, mergedVec); -- 1.8.3.2 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
