From: Luo Xionghu <xionghu....@intel.com> Signed-off-by: Luo Xionghu <xionghu....@intel.com> --- backend/src/backend/gen_insn_selection.cpp | 55 ++++++++++++-- backend/src/ir/instruction.cpp | 14 +++- backend/src/ir/instruction.hpp | 4 +- backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 117 ++++++++++++++++++++++++----- backend/src/libocl/tmpl/ocl_simd.tmpl.h | 17 +++++ backend/src/llvm/llvm_gen_backend.cpp | 89 +++++++++++++++++++++- backend/src/llvm/llvm_gen_ocl_function.hxx | 6 ++ backend/src/llvm/llvm_scalarize.cpp | 5 ++ 8 files changed, 274 insertions(+), 33 deletions(-)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 1cab40c..cabc6a3 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -7811,25 +7811,56 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp /*! Media Block Read pattern */ DECL_PATTERN(MediaBlockReadInstruction) { + uint32_t fixBlockSize(const ir::MediaBlockReadInstruction &insn, uint32_t typeSize, uint32_t simdWidth, uint32_t &block_width) const + { + uint8_t width = insn.getWidth(); + uint8_t height = insn.getHeight(); + uint32_t vec_size = insn.getVectorSize(); + uint32_t blocksize = 0; + if (width && height) { + if (width * height * typeSize > vec_size * simdWidth * typeSize) { + if (width <= simdWidth * vec_size) { + height = vec_size * simdWidth / width; + } else { + height = 1; + width = vec_size * simdWidth / height; + } + } + }else { + width = simdWidth; + height = vec_size; + } + block_width = typeSize * (width < simdWidth ? width : simdWidth); + blocksize = (block_width - 1) % 32 | (height - 1) << 16; + return blocksize; + } + bool emitOne(Selection::Opaque &sel, const ir::MediaBlockReadInstruction &insn, bool &markChildren) const { using namespace ir; uint32_t vec_size = insn.getVectorSize(); uint32_t simdWidth = sel.curr.execWidth; const Type type = insn.getType(); - const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; + uint32_t typeSize = 0; + if(type == TYPE_U32) { + typeSize = 4; + }else if(type == TYPE_U16) { + typeSize = 2; + }else if(type == TYPE_U8) { + typeSize = 1; + }else + NOT_IMPLEMENTED; uint32_t response_size = simdWidth * vec_size * typeSize / 32; // ushort in simd8 will have half reg thus 0.5 reg size, but response lenght is still 1 response_size = response_size ? 
response_size : 1; - uint32_t block_width = typeSize * simdWidth; - uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; - + uint32_t block_width = 0; + uint32_t blocksize = fixBlockSize(insn, typeSize, simdWidth, block_width); vector<GenRegister> valuesVec; vector<GenRegister> tmpVec; for (uint32_t i = 0; i < vec_size; ++i) { valuesVec.push_back(sel.selReg(insn.getDst(i), type)); - if(simdWidth == 16 && typeSize == 4) + if((simdWidth == 16 && typeSize == 4) || typeSize == 1) tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG))); } const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD); @@ -7855,15 +7886,23 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp sel.MOV(blocksizereg, GenRegister::immud(blocksize)); sel.pop(); - if (simdWidth * typeSize < 64) { + if (block_width < 64) { sel.push(); sel.curr.execWidth = 8; sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; // Now read the data - sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size); + if(typeSize == 1) { + sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), response_size); + for (uint32_t i = 0; i < vec_size; i++) { + sel.MOV(valuesVec[i], sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth)); + sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 16), sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth + 8)); + } + }else + sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size); + sel.pop(); - } else if (simdWidth * typeSize == 64) { + } else if (block_width == 64) { sel.push(); sel.curr.execWidth = 8; sel.curr.predicate = GEN_PREDICATE_NONE; diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index f0c3957..4b87e4a 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -1070,7 +1070,7 @@ namespace ir { public 
TupleDstPolicy<MediaBlockReadInstruction> { public: - INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum, Type type) { + INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum, Type type, uint8_t width = 0, uint8_t height = 0) { this->opcode = OP_MBREAD; this->dst = dst; this->dstNum = vec_size; @@ -1078,6 +1078,8 @@ namespace ir { this->srcNum = srcNum; this->imageIdx = imageIdx; this->type = type; + this->width = width; + this->height = height; } INLINE bool wellFormed(const Function &fn, std::string &why) const; INLINE void out(std::ostream &out, const Function &fn) const { @@ -1095,6 +1097,8 @@ namespace ir { INLINE uint8_t getImageIndex(void) const { return this->imageIdx; } INLINE uint8_t getVectorSize(void) const { return this->dstNum; } INLINE Type getType(void) const { return this->type; } + INLINE uint8_t getWidth(void) const { return this->width; } + INLINE uint8_t getHeight(void) const { return this->height; } Tuple src; Tuple dst; @@ -1102,6 +1106,8 @@ namespace ir { uint8_t srcNum; uint8_t dstNum; Type type; + uint8_t width; + uint8_t height; }; class ALIGNED_INSTRUCTION MediaBlockWriteInstruction : @@ -2409,6 +2415,8 @@ DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), g DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex()) DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize()) DECL_MEM_FN(MediaBlockReadInstruction, Type, getType(void), getType()) +DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getWidth(void), getWidth()) +DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getHeight(void), getHeight()) DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex()) DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize()) DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType()) @@ 
-2720,8 +2728,8 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg)) return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert(); } - Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type) { - return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type).convert(); + Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type, uint8_t width, uint8_t height) { + return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type, width, height).convert(); } Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) { diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index 16c2045..7e90576 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -646,6 +646,8 @@ namespace ir { uint8_t getImageIndex() const; uint8_t getVectorSize() const; Type getType(void) const; + uint8_t getWidth() const; + uint8_t getHeight() const; }; /*! Media Block Write. */ @@ -893,7 +895,7 @@ namespace ir { /*! printf */ Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num); /*! media block read */ - Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type); + Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type, uint8_t width, uint8_t height); /*! 
media block write */ Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type); } /* namespace ir */ diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index 97e33fe..55bf6f0 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -262,41 +262,61 @@ OVERLOADABLE void intel_sub_group_block_write_ui8(global uint* p,uint8 data) __gen_ocl_sub_group_block_write_ui_mem8(p, data); } -PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y); -PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y); -PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y); -PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y); +PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y, int w, int h); +PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y, int w, int h); +PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y, int w, int h); +PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y, int w, int h); OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y, 0, 0); } OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y, 0, 0); } OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y, 0, 0); } OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord) { - 
return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y, 0, 0); } OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y, 0, 0); } OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y, 0, 0); } OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y, 0, 0); } OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y, 0, 0); +} + +OVERLOADABLE uint intel_sub_group_media_block_read_ui(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_ui_image(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE uint2 intel_sub_group_media_block_read_ui2(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_ui_image2(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE uint4 intel_sub_group_media_block_read_ui4(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_ui_image4(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE uint8 intel_sub_group_media_block_read_ui8(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_ui_image8(image, src_byte_offset.x, 
src_byte_offset.y, width, height); } void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data); @@ -378,25 +398,51 @@ OVERLOADABLE void intel_sub_group_block_write_us8(global ushort* p,ushort8 data) __gen_ocl_sub_group_block_write_us_mem8(p, data); } -PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y); -PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y); -PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y); -PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y); +PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y, int w, int h); +PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y, int w, int h); +PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y, int w, int h); +PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y, int w, int h); +PURE CONST ushort16 __gen_ocl_sub_group_block_read_us_image16(image2d_t p, int x, int y, int w, int h); OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y, 0, 0); } OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y, 0, 0); } OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y, 0, 0); } OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y); + return 
__gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y, 0, 0); +} + +OVERLOADABLE ushort intel_sub_group_media_block_read_us(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_us_image(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE ushort2 intel_sub_group_media_block_read_us2(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_us_image2(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE ushort4 intel_sub_group_media_block_read_us4(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_us_image4(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE ushort8 intel_sub_group_media_block_read_us8(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_us_image8(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE ushort16 intel_sub_group_media_block_read_us16(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_us_image16(image, src_byte_offset.x, src_byte_offset.y, width, height); } void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data); @@ -419,6 +465,37 @@ OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort { __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data); } + +PURE CONST uchar __gen_ocl_sub_group_block_read_uc_image(image2d_t p, int x, int y, int w, int h); +PURE CONST uchar2 __gen_ocl_sub_group_block_read_uc_image2(image2d_t p, int x, int y, int w, int h); +PURE CONST uchar4 __gen_ocl_sub_group_block_read_uc_image4(image2d_t p, int x, int y, int w, int h); +PURE CONST uchar8 __gen_ocl_sub_group_block_read_uc_image8(image2d_t p, int x, int y, int w, int h); +PURE CONST 
uchar16 __gen_ocl_sub_group_block_read_uc_image16(image2d_t p, int x, int y, int w, int h); +OVERLOADABLE uchar intel_sub_group_media_block_read_uc(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_uc_image(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE uchar2 intel_sub_group_media_block_read_uc2(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_uc_image2(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE uchar4 intel_sub_group_media_block_read_uc4(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_uc_image4(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE uchar8 intel_sub_group_media_block_read_uc8(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_uc_image8(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + +OVERLOADABLE uchar16 intel_sub_group_media_block_read_uc16(int2 src_byte_offset, int width, int height, read_only image2d_t image) +{ + return __gen_ocl_sub_group_block_read_uc_image16(image, src_byte_offset.x, src_byte_offset.y, width, height); +} + #define SHUFFLE_DOWN(TYPE) \ OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \ TYPE res0, res1; \ diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h index 608551b..2592d10 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h @@ -250,3 +250,20 @@ OVERLOADABLE void intel_sub_group_block_write_us(image2d_t image, int2 byte_coor OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coord, ushort2 data); OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data); OVERLOADABLE void 
intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data); + +OVERLOADABLE uchar intel_sub_group_media_block_read_uc(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE uchar2 intel_sub_group_media_block_read_uc2(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE uchar4 intel_sub_group_media_block_read_uc4(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE uchar8 intel_sub_group_media_block_read_uc8(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE uchar16 intel_sub_group_media_block_read_uc16(int2 src_byte_offset, int width, int height, read_only image2d_t image); + +OVERLOADABLE ushort intel_sub_group_media_block_read_us(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE ushort2 intel_sub_group_media_block_read_us2(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE ushort4 intel_sub_group_media_block_read_us4(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE ushort8 intel_sub_group_media_block_read_us8(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE ushort16 intel_sub_group_media_block_read_us16(int2 src_byte_offset, int width, int height, read_only image2d_t image); + +OVERLOADABLE uint intel_sub_group_media_block_read_ui(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE uint2 intel_sub_group_media_block_read_ui2(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE uint4 intel_sub_group_media_block_read_ui4(int2 src_byte_offset, int width, int height, read_only image2d_t image); +OVERLOADABLE uint8 intel_sub_group_media_block_read_ui8(int2 src_byte_offset, int width, int height, read_only image2d_t image); diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp index 3fefa92..faa9c37 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -737,6 +737,7 @@ namespace gbe // Emit subgroup instructions void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32); void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32); + void checkMediaBlockWidthandHeight(CallInst &I, uint8_t width, uint8_t height, uint8_t vec_size, ir::Type type); uint8_t appendSampler(CallSite::arg_iterator AI); uint8_t getImageID(CallInst &I); @@ -4059,6 +4060,12 @@ namespace gbe case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2: case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4: case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE16: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE16: case GEN_OCL_ENQUEUE_SET_NDRANGE_INFO: case GEN_OCL_ENQUEUE_GET_NDRANGE_INFO: this->newRegister(&I); @@ -4463,6 +4470,61 @@ namespace gbe GBE_ASSERT(AI == AE); } + /*! Validate the width and height of a media block read. On the first + * violated rule an LLVM error is emitted on I, has_errors is set and the + * unit is marked invalid. vec_size is currently not consulted. */ + void GenWriter::checkMediaBlockWidthandHeight(CallInst& I, uint8_t width, uint8_t height, uint8_t vec_size, ir::Type type) { + if (width == 0) { + has_errors = true; + // Concatenate via Twine: adding an integer to a string literal is pointer arithmetic. + Func->getContext().emitError(&I, llvm::Twine("Media Block width value illegal, width is:") + llvm::Twine(static_cast<unsigned>(width))); + ctx.getUnit().setValid(false); + return; + } + if (height == 0) { + has_errors = true; + Func->getContext().emitError(&I, llvm::Twine("Media Block height value illegal, height is:") + llvm::Twine(static_cast<unsigned>(height))); + ctx.getUnit().setValid(false); + return; + } + const uint32_t typeSize = type == ir::TYPE_U8 ? 1 : (type == ir::TYPE_U16 ? 2 : 4); + const uint32_t widthBytes = width * typeSize; + + // Narrower blocks are allowed to span more rows. + uint32_t maxRows; + if (widthBytes <= 4) + maxRows = 64; + 
else if (widthBytes <= 8) + maxRows = 32; + else if (widthBytes <= 16) + maxRows = 16; + else + maxRows = 8; + + // The byte width must be a multiple of 4. + if (widthBytes % 4 != 0) { + has_errors = true; + Func->getContext().emitError(&I, llvm::Twine("Media Block widthBytes value illegal, widthBytes is:") + llvm::Twine(widthBytes)); + ctx.getUnit().setValid(false); + return; + } + + // Rows hold at most 64 bytes for 4-byte elements and 32 bytes otherwise. + if ((typeSize == 4 && widthBytes > 64) || (typeSize != 4 && widthBytes > 32)) { + has_errors = true; + Func->getContext().emitError(&I, llvm::Twine("Media Block widthBytes value illegal, widthBytes is:") + llvm::Twine(widthBytes)); + ctx.getUnit().setValid(false); + return; + } + + if (height > maxRows) { + has_errors = true; + Func->getContext().emitError(&I, llvm::Twine("Media Block height value illegal, height is larger than: ") + llvm::Twine(maxRows)); + ctx.getUnit().setValid(false); + return; + } + } + void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) { CallSite::arg_iterator AI = CS.arg_begin(); CallSite::arg_iterator AE = CS.arg_end(); @@ -4489,7 +4551,20 @@ namespace gbe dstTupleData.push_back(getRegister(&I, i)); const ir::Tuple srctuple = ctx.arrayTuple(src, 2); const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size); - ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type); + Constant *CWidth = dyn_cast<Constant>(*AI++); + GBE_ASSERT(CWidth != NULL); + const ir::Immediate &width = processConstantImm(CWidth); + Constant *CHeight = dyn_cast<Constant>(*AI++); + GBE_ASSERT(CHeight != NULL); + const ir::Immediate &height = processConstantImm(CHeight); + // check width and height legality. 
+ if (width.getIntegerValue() != 0 || height.getIntegerValue() != 0) { + checkMediaBlockWidthandHeight(I, width.getIntegerValue(), height.getIntegerValue(), vec_size, type); + if(!ctx.getUnit().getValid()) + return; + } + //map w * h region to simd_size + ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type, width.getIntegerValue(), height.getIntegerValue()); } GBE_ASSERT(AI == AE); @@ -5473,6 +5548,18 @@ namespace gbe this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U16); break; case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8: this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE16: + this->emitBlockReadWriteImageInst(I, CS, false, 16, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE: + this->emitBlockReadWriteImageInst(I, CS, false, 1, ir::TYPE_U8); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE2: + this->emitBlockReadWriteImageInst(I, CS, false, 2, ir::TYPE_U8); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE4: + this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U8); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE8: + this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U8); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE16: + this->emitBlockReadWriteImageInst(I, CS, false, 16, ir::TYPE_U8); break; case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE: this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U16); break; case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2: diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 86485da..0243f05 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -250,10 +250,16 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE, __gen_ocl_sub_group_block_ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE2, __gen_ocl_sub_group_block_read_us_image2) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE4, 
__gen_ocl_sub_group_block_read_us_image4) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE8, __gen_ocl_sub_group_block_read_us_image8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE16, __gen_ocl_sub_group_block_read_us_image16) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block_write_us_image) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE, __gen_ocl_sub_group_block_read_uc_image) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE2, __gen_ocl_sub_group_block_read_uc_image2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE4, __gen_ocl_sub_group_block_read_uc_image4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE8, __gen_ocl_sub_group_block_read_uc_image8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE16, __gen_ocl_sub_group_block_read_uc_image16) // common function DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp) diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp index 8850abb..c413ab4 100644 --- a/backend/src/llvm/llvm_scalarize.cpp +++ b/backend/src/llvm/llvm_scalarize.cpp @@ -723,6 +723,11 @@ namespace gbe { case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2: case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4: case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE16: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE16: setAppendPoint(call); extractFromVector(call); break; -- 2.5.0 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org 
https://lists.freedesktop.org/mailman/listinfo/beignet