From: Pan Xiuli <xiuli....@intel.com> Use OWORD block read/write to transfer a block of data per thread. V2: Refine some register types.
Signed-off-by: Pan Xiuli <xiuli....@intel.com> --- backend/src/backend/gen/gen_mesa_disasm.c | 15 +++++ backend/src/backend/gen_context.cpp | 52 +++++++++++++++ backend/src/backend/gen_context.hpp | 2 + backend/src/backend/gen_encoder.cpp | 37 ++++++++++- backend/src/backend/gen_encoder.hpp | 4 ++ .../src/backend/gen_insn_gen7_schedule_info.hxx | 2 + backend/src/backend/gen_insn_selection.cpp | 77 ++++++++++++++++++++-- backend/src/backend/gen_insn_selection.hpp | 4 ++ backend/src/backend/gen_insn_selection.hxx | 2 + backend/src/ir/instruction.cpp | 26 ++++++-- backend/src/ir/instruction.hpp | 8 ++- backend/src/ir/liveness.cpp | 5 ++ backend/src/libocl/CMakeLists.txt | 2 +- backend/src/libocl/src/ocl_substore.ll | 9 +++ backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 54 +++++++++++++++ backend/src/libocl/tmpl/ocl_simd.tmpl.h | 11 ++++ backend/src/llvm/llvm_gen_backend.cpp | 65 ++++++++++++++++++ backend/src/llvm/llvm_gen_ocl_function.hxx | 5 +- 18 files changed, 365 insertions(+), 15 deletions(-) create mode 100644 backend/src/libocl/src/ocl_substore.ll diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c index 067ddd8..9200c26 100644 --- a/backend/src/backend/gen/gen_mesa_disasm.c +++ b/backend/src/backend/gen/gen_mesa_disasm.c @@ -432,6 +432,14 @@ static const char *data_port_data_cache_category[] = { "scratch", }; +static const char *data_port_data_cache_block_size[] = { + "1 OWORD LOW", + "1 OWORD HIGH", + "2 OWORD", + "4 OWORD", + "8 OWORD", +}; + static const char *data_port_scratch_block_size[] = { "1 register", "2 registers", @@ -576,6 +584,7 @@ static int gen_version; #define MSG_GW_ACKREQ(inst) GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.ackreq) #define GENERIC_MSG_LENGTH(inst) GEN_BITS_FIELD(inst, bits3.generic_gen5.msg_length) #define GENERIC_RESPONSE_LENGTH(inst) GEN_BITS_FIELD(inst, bits3.generic_gen5.response_length) +#define OWORD_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_oblock_rw.block_size) static int 
is_special_acc(const void* inst) { @@ -1483,6 +1492,12 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac data_port_data_cache_byte_scattered_simd_mode[BYTE_RW_SIMD_MODE(inst)], data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)], data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]); + else if(UNTYPED_RW_MSG_TYPE(inst) == 0 || UNTYPED_RW_MSG_TYPE(inst) == 8) + format(file, " (bti: %d, data size: %s, %s, %s)", + UNTYPED_RW_BTI(inst), + data_port_data_cache_block_size[OWORD_RW_BLOCK_SIZE(inst)], + data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)], + data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]); else format(file, " not implemented"); } else { diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 4d0a3f3..90b8b45 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3487,6 +3487,58 @@ namespace gbe p->pop(); } + void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { + const GenRegister dst = ra->genReg(insn.dst(0)); + const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD); + GenRegister header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD); + + p->push(); + // Copy r0 into the header first + p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->MOV(header, GenRegister::ud8grf(0, 0)); + + // Update the header with the current address + p->curr.execWidth = 1; + p->SHR(GenRegister::offset(header, 0, 2*4), addr, GenRegister::immud(4)); + + // Put zero in the general state base address + p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0)); + + p->pop(); + // Now read the data + p->OBREAD(dst, header, insn.getbti(), insn.extra.elem); + } + + void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) { + const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(2)), GEN_TYPE_UD); + GenRegister 
header; + if (simdWidth == 8) + header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD); + else + header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(0)),1), GEN_TYPE_UD); + + p->push(); + // Copy r0 into the header first + p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->MOV(header, GenRegister::ud8grf(0,0)); + + // Update the header with the current address + p->curr.execWidth = 1; + p->SHR(GenRegister::offset(header, 0, 2*4), addr, GenRegister::immud(4)); + + // Put zero in the general state base address + p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0)); + + p->pop(); + // Now write the data + p->OBWRITE(header, insn.getbti(), insn.extra.elem); + } + + BVAR(OCL_OUTPUT_REG_ALLOC, false); BVAR(OCL_OUTPUT_ASM, false); diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 4c43ccb..a634338 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -187,6 +187,8 @@ namespace gbe void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode); unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc); void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned jip0); + void emitOBReadInstruction(const SelectionInstruction &insn); + void emitOBWriteInstruction(const SelectionInstruction &insn); /*! 
Implements base class */ virtual Kernel *allocateKernel(void); diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index 31afa67..e745b9c 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -258,7 +258,7 @@ namespace gbe else NOT_SUPPORTED; } -#if 0 + static void setOBlockRW(GenEncoder *p, GenNativeInstruction *insn, uint32_t bti, @@ -275,7 +275,6 @@ namespace gbe insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3; insn->bits3.gen7_oblock_rw.header_present = 1; } -#endif static void setDWordScatterMessgae(GenEncoder *p, GenNativeInstruction *insn, @@ -1244,6 +1243,40 @@ namespace gbe setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num); } + void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + const uint32_t msg_length = 1; + const uint32_t response_length = size / 2; // Size is in owords + this->setHeader(insn); + this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); + this->setSrc1(insn, GenRegister::immud(0)); + setOBlockRW(this, + insn, + bti, + size, + GEN7_OBLOCK_READ, + msg_length, + response_length); + } + + void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) { + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); + const uint32_t msg_length = 1 + size / 2; // Size is in owords + const uint32_t response_length = 0; + this->setHeader(insn); + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); + this->setSrc1(insn, GenRegister::immud(0)); + this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); + setOBlockRW(this, + insn, + bti, + size, + GEN7_OBLOCK_WRITE, + msg_length, + response_length); + } + void GenEncoder::EOT(uint32_t msg) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); this->setDst(insn, 
GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index 0239293..a53c879 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -267,6 +267,10 @@ namespace gbe virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null()); virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null()); + /*! OBlock read */ + void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize); + /*! OBlock write */ + void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize); GBE_CLASS(GenEncoder); //!< Use custom allocators virtual void alu3(uint32_t opcode, GenRegister dst, diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx index cb5c4f1..d297726 100644 --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx @@ -50,3 +50,5 @@ DECL_GEN7_SCHEDULE(StoreProfiling, 80, 1, 1) DECL_GEN7_SCHEDULE(WorkGroupOp, 80, 1, 1) DECL_GEN7_SCHEDULE(SubGroupOp, 80, 1, 1) DECL_GEN7_SCHEDULE(Printf, 80, 1, 1) +DECL_GEN7_SCHEDULE(OBRead, 80, 1, 1) +DECL_GEN7_SCHEDULE(OBWrite, 80, 1, 1) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 855c39d..e974e97 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -188,7 +188,8 @@ namespace gbe this->opcode == SEL_OP_BYTE_GATHER || this->opcode == SEL_OP_SAMPLE || this->opcode == SEL_OP_VME || - this->opcode == SEL_OP_DWORD_GATHER; + this->opcode == SEL_OP_DWORD_GATHER || + this->opcode == SEL_OP_OBREAD; } bool SelectionInstruction::modAcc(void) const { @@ -210,7 +211,8 @@ namespace gbe this->opcode == SEL_OP_WRITE64 || this->opcode == SEL_OP_ATOMIC || 
this->opcode == SEL_OP_BYTE_SCATTER || - this->opcode == SEL_OP_TYPED_WRITE; + this->opcode == SEL_OP_TYPED_WRITE || + this->opcode == SEL_OP_OBWRITE; } bool SelectionInstruction::isBranch(void) const { @@ -697,6 +699,11 @@ namespace gbe /*! Sub Group Operations */ void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, GenRegister tmpData1, GenRegister tmpData2); + /*! Oblock read */ + void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, uint32_t bti, uint32_t size); + /*! Oblock write */ + void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, uint32_t bti, uint32_t size); + /* common functions for both binary instruction and sel_cmp and compare instruction. It will handle the IMM or normal register assignment, and will try to avoid LOADI as much as possible. */ @@ -2014,6 +2021,40 @@ namespace gbe insn->src(0) = src; insn->src(1) = tmpData2; } + void Selection::Opaque::OBREAD(GenRegister dst, + GenRegister addr, + GenRegister header, + uint32_t bti, + uint32_t size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2); + insn->dst(0) = dst; + insn->src(0) = addr; + insn->src(1) = header; + insn->setbti(bti); + insn->extra.elem = size / sizeof(int[4]); // number of owords + } + + void Selection::Opaque::OBWRITE(GenRegister addr, + GenRegister value, + GenRegister header, + uint32_t bti, + uint32_t size) { + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3); + SelectionVector *vector = this->appendVector(); + insn->src(0) = header; + insn->src(1) = value; + insn->src(2) = addr; + insn->state = this->curr; + insn->setbti(bti); + insn->extra.elem = size / sizeof(int[4]); // number of owords + + // We need to put the header and the data together + vector->regNum = 2; + vector->reg = &insn->src(0); + vector->offsetID = 0; + vector->isSrc = 1; + } + // Boiler plate to initialize the selection library at c++ pre-main static SelectionLibrary *selLib = NULL; @@ -4001,6 +4042,18 @@ extern bool 
OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp } } + void emitOWordRead(Selection::Opaque &sel, + const ir::LoadInstruction &insn, + GenRegister address, + ir::BTI bti) const + { + using namespace ir; + const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); + const uint32_t simdWidth = sel.ctx.getSimdWidth(); + sel.OBREAD(value, address, header, bti.imm, simdWidth * sizeof(int)); + } + // check whether all binded table index point to constant memory INLINE bool isAllConstant(const ir::BTI &bti) const { if (bti.isConst && bti.imm == BTI_CONSTANT) @@ -4036,7 +4089,9 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp const uint32_t elemSize = getByteScatterGatherSize(sel, type); bool allConstant = isAllConstant(bti); - if (allConstant) { + if (insn.isBlock()) + this->emitOWordRead(sel, insn, address, bti); + else if (allConstant) { // XXX TODO read 64bit constant through constant cache // Per HW Spec, constant cache messages can read at least DWORD data. // So, byte/short data type, we have to read through data cache. 
@@ -4163,6 +4218,18 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp } } + void emitOWordWrite(Selection::Opaque &sel, + const ir::StoreInstruction &insn, + GenRegister address, + ir::BTI bti) const + { + using namespace ir; + const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); + const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); + const uint32_t simdWidth = sel.ctx.getSimdWidth(); + sel.OBWRITE(address, value, header, bti.imm, simdWidth * sizeof(int)); + } + virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { using namespace ir; @@ -4184,7 +4251,9 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp assert(0 && "stateless not supported yet"); } - if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD) + if (insn.isBlock()) + this->emitOWordWrite(sel, insn, address, bti); + else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD) this->emitWrite64(sel, insn, address, bti); else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD) this->emitUntypedWrite(sel, insn, address, bti); diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp index 8d2e1da..51af686 100644 --- a/backend/src/backend/gen_insn_selection.hpp +++ b/backend/src/backend/gen_insn_selection.hpp @@ -175,6 +175,8 @@ namespace gbe INLINE uint32_t getbti() const { GBE_ASSERT(isRead() || isWrite()); switch (opcode) { + case SEL_OP_OBREAD: + case SEL_OP_OBWRITE: case SEL_OP_DWORD_GATHER: return extra.function; case SEL_OP_SAMPLE: return extra.rdbti; case SEL_OP_VME: return extra.vme_bti; @@ -188,6 +190,8 @@ namespace gbe INLINE void setbti(uint32_t bti) { GBE_ASSERT(isRead() || isWrite()); switch (opcode) { + case SEL_OP_OBREAD: + case SEL_OP_OBWRITE: case SEL_OP_DWORD_GATHER: extra.function = bti; return; case SEL_OP_SAMPLE: extra.rdbti = bti; return; case SEL_OP_VME: extra.vme_bti = bti; return; diff --git 
a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx index 0e11f9f..4a7caff 100644 --- a/backend/src/backend/gen_insn_selection.hxx +++ b/backend/src/backend/gen_insn_selection.hxx @@ -96,3 +96,5 @@ DECL_SELECTION_IR(STORE_PROFILING, StoreProfilingInstruction) DECL_SELECTION_IR(WORKGROUP_OP, WorkGroupOpInstruction) DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction) DECL_SELECTION_IR(PRINTF, PrintfInstruction) +DECL_SELECTION_IR(OBREAD, OBReadInstruction) +DECL_SELECTION_IR(OBWRITE, OBWriteInstruction) diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 47606b2..88491a7 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -483,10 +483,12 @@ namespace ir { AddressSpace AS, uint32_t _valueNum, bool dwAligned, - AddressMode AM) + AddressMode AM, + bool ifBlock = false) : MemInstruction(AM, AS, dwAligned, type, offset), valueNum(_valueNum), - values(dstValues) + values(dstValues), + ifBlock(ifBlock) { this->opcode = OP_LOAD; } @@ -519,9 +521,11 @@ namespace ir { } INLINE bool wellFormed(const Function &fn, std::string &why) const; INLINE void out(std::ostream &out, const Function &fn) const; + INLINE bool isBlock() const { return ifBlock; } uint8_t valueNum; Tuple values; + bool ifBlock; }; class ALIGNED_INSTRUCTION StoreInstruction : public MemInstruction, @@ -534,12 +538,14 @@ namespace ir { AddressSpace addrSpace, uint32_t valueNum, bool dwAligned, - AddressMode AM) + AddressMode AM, + bool ifBlock = false) : MemInstruction(AM, addrSpace, dwAligned, type, offset) { this->opcode = OP_STORE; this->values = values; this->valueNum = valueNum; + this->ifBlock = ifBlock; } INLINE unsigned getValueNum() const { return valueNum; } INLINE Register getValue(const Function &fn, unsigned id) const { @@ -565,9 +571,12 @@ namespace ir { } INLINE bool wellFormed(const Function &fn, std::string &why) const; INLINE void out(std::ostream &out, const Function &fn) const; + INLINE bool 
isBlock() const { return ifBlock; } + Register dst[0]; uint8_t valueNum; Tuple values; + bool ifBlock; }; class ALIGNED_INSTRUCTION SampleInstruction : // TODO @@ -1655,6 +1664,8 @@ namespace ir { } INLINE void LoadInstruction::out(std::ostream &out, const Function &fn) const { + if(ifBlock) + out<< "BLOCK"; this->outOpcode(out); out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned"; out << " {"; @@ -1672,6 +1683,8 @@ namespace ir { } INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const { + if(ifBlock) + out<< "BLOCK"; this->outOpcode(out); out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned"; out << " %" << this->getSrc(fn, 0) << " {"; @@ -2221,7 +2234,9 @@ DECL_MEM_FN(MemInstruction, bool, isAligned(void), isAligned()) DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void), getAddressIndex()) DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode()) DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum()) +DECL_MEM_FN(StoreInstruction, bool, isBlock(void), isBlock()) DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum()) +DECL_MEM_FN(LoadInstruction, bool, isBlock(void), isBlock()) DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType()) DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex()) DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated()) @@ -2475,9 +2490,10 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg)) uint32_t valueNum, \ bool dwAligned, \ AddressMode AM, \ - unsigned SurfaceIndex) \ + unsigned SurfaceIndex, \ + bool isBlock) \ { \ - internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \ + internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM, isBlock); \ insn.setSurfaceIndex(SurfaceIndex);\ return insn.convert(); \ } \ diff --git a/backend/src/ir/instruction.hpp 
b/backend/src/ir/instruction.hpp index a605f45..4e7d5b7 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -356,6 +356,8 @@ namespace ir { } /*! Return true if the given instruction is an instance of this class */ static bool isClassOf(const Instruction &insn); + /*! Return true if the given instruction is block write */ + bool isBlock() const; }; /*! Load instruction. The source is simply the address where to get the data. @@ -372,6 +374,8 @@ namespace ir { } /*! Return true if the given instruction is an instance of this class */ static bool isClassOf(const Instruction &insn); + /*! Return true if the given instruction is block read */ + bool isBlock() const; }; /*! Load immediate instruction loads an typed immediate value into the given @@ -827,10 +831,10 @@ namespace ir { /*! ret */ Instruction RET(void); /*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */ - Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex); + Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex, bool isBlock = false); Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti); /*! store.type.space offset {src1,...,src_valueNum} value {bti}*/ - Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex); + Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex, bool isBlock = false); Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti); /*! 
loadi.type dst value */ Instruction LOADI(Type type, Register dst, ImmediateIndex value); diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp index d48f067..3162d13 100644 --- a/backend/src/ir/liveness.cpp +++ b/backend/src/ir/liveness.cpp @@ -117,11 +117,16 @@ namespace ir { if (insn.getOpcode() == ir::OP_SIMD_ID) uniform = false; + // do not change dst uniform for block read + if (insn.getOpcode() == ir::OP_LOAD && ir::cast<ir::LoadInstruction>(insn).isBlock()) + uniform = false; + for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { const Register reg = insn.getSrc(srcID); if (!fn.isUniformRegister(reg)) uniform = false; } + // A destination is a killed value for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { const Register reg = insn.getDst(dstID); diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt index 1d1ec68..83e767c 100644 --- a/backend/src/libocl/CMakeLists.txt +++ b/backend/src/libocl/CMakeLists.txt @@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M) ) ENDMACRO(ADD_LL_TO_BC_TARGET) -SET (OCL_LL_MODULES ocl_barrier ocl_clz) +SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_substore) FOREACH(f ${OCL_LL_MODULES}) COPY_THE_LL(${f}) ADD_LL_TO_BC_TARGET(${f}) diff --git a/backend/src/libocl/src/ocl_substore.ll b/backend/src/libocl/src/ocl_substore.ll new file mode 100644 index 0000000..665cdfa --- /dev/null +++ b/backend/src/libocl/src/ocl_substore.ll @@ -0,0 +1,9 @@ +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir" + +declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* nocapture, i32) nounwind alwaysinline noduplicate + +define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32 addrspace(1)* %p, i32 %data) nounwind alwaysinline noduplicate { + call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p, i32 %data) + ret void +} diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index a25dcef..66490cc 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -133,3 +133,57 @@ RANGE_OP(scan_exclusive, max, float, true) RANGE_OP(scan_exclusive, max, double, true) #undef RANGE_OP +PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p); +OVERLOADABLE uint intel_sub_group_block_read(const global uint* p) +{ + return __gen_ocl_sub_group_block_read_mem(p); +} +OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p) +{ + return (uint2)(intel_sub_group_block_read(p), + intel_sub_group_block_read(p + get_simd_size())); +} +OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p) +{ + return (uint4)(intel_sub_group_block_read(p), + intel_sub_group_block_read(p + get_simd_size()), + intel_sub_group_block_read(p + get_simd_size() * 2), + intel_sub_group_block_read(p + get_simd_size() * 3)); + +} +OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p) +{ + return (uint8)(intel_sub_group_block_read(p), + intel_sub_group_block_read(p + get_simd_size()), + intel_sub_group_block_read(p + get_simd_size() * 2), + intel_sub_group_block_read(p + get_simd_size() * 3), + intel_sub_group_block_read(p + get_simd_size() * 4), + intel_sub_group_block_read(p + get_simd_size() * 5), + intel_sub_group_block_read(p + get_simd_size() * 6), + intel_sub_group_block_read(p + get_simd_size() * 7)); +} + +OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data) +{ + intel_sub_group_block_write(p, data.s0); + intel_sub_group_block_write(p + get_simd_size(), data.s1); +} +OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data) +{ + intel_sub_group_block_write(p, data.s0); + intel_sub_group_block_write(p + get_simd_size(), data.s1); + intel_sub_group_block_write(p + get_simd_size() * 2, data.s2); + intel_sub_group_block_write(p + get_simd_size() * 3, data.s3); + +} +OVERLOADABLE void 
intel_sub_group_block_write8(const global uint* p,uint8 data) +{ + intel_sub_group_block_write(p, data.s0); + intel_sub_group_block_write(p + get_simd_size(), data.s1); + intel_sub_group_block_write(p + get_simd_size() * 2, data.s2); + intel_sub_group_block_write(p + get_simd_size() * 3, data.s3); + intel_sub_group_block_write(p + get_simd_size() * 4, data.s4); + intel_sub_group_block_write(p + get_simd_size() * 5, data.s5); + intel_sub_group_block_write(p + get_simd_size() * 6, data.s6); + intel_sub_group_block_write(p + get_simd_size() * 7, data.s7); +} diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h index 355ee30..d0676be 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h @@ -132,3 +132,14 @@ OVERLOADABLE double sub_group_scan_exclusive_max(double x); OVERLOADABLE float intel_sub_group_shuffle(float x, uint c); OVERLOADABLE int intel_sub_group_shuffle(int x, uint c); OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c); + +/* blocak read/write */ +OVERLOADABLE uint intel_sub_group_block_read(const global uint* p); +OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p); +OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p); +OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p); + +OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data); +OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data); +OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data); +OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data); diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index a091d7c..ffa838c 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -697,6 +697,8 @@ namespace gbe void emitWorkGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps 
opcode); // Emit subgroup instructions void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode); + // Emit subgroup instructions + void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite); uint8_t appendSampler(CallSite::arg_iterator AI); uint8_t getImageID(CallInst &I); @@ -3744,6 +3746,9 @@ namespace gbe case GEN_OCL_LRP: this->newRegister(&I); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: + this->newRegister(&I, NULL, false); + break; case GEN_OCL_PRINTF: this->newRegister(&I); // fall through case GEN_OCL_PUTS: @@ -3758,6 +3763,7 @@ namespace gbe case GEN_OCL_CALC_TIMESTAMP: case GEN_OCL_STORE_PROFILING: case GEN_OCL_DEBUGWAIT: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: break; case GEN_OCL_NOT_FOUND: default: @@ -3952,6 +3958,61 @@ namespace gbe GBE_ASSERT(AI == AE); } + void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite) { + CallSite::arg_iterator AI = CS.arg_begin(); + CallSite::arg_iterator AE = CS.arg_end(); + GBE_ASSERT(AI != AE); + + Value *llvmPtr = *(AI++); + Value *llvmValues; + ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace()); + GBE_ASSERT(addrSpace == ir::MEM_GLOBAL); + ir::Register pointer = this->getRegister(llvmPtr); + + ir::Register ptr; + ir::Register btiReg; + unsigned SurfaceIndex = 0xff; + + ir::AddressMode AM; + if (legacyMode) { + Value *bti = getBtiRegister(llvmPtr); + Value *ptrBase = getPointerBase(llvmPtr); + ir::Register baseReg = this->getRegister(ptrBase); + if (isa<ConstantInt>(bti)) { + AM = ir::AM_StaticBti; + SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue(); + addrSpace = btiToGen(SurfaceIndex); + } else { + AM = ir::AM_DynamicBti; + addrSpace = ir::MEM_MIXED; + btiReg = this->getRegister(bti); + } + const ir::RegisterFamily pointerFamily = ctx.getPointerFamily(); + ptr = ctx.reg(pointerFamily); + ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg); + } else { + AM = ir::AM_Stateless; + ptr = pointer; + } + + ir::Type 
type = ir::TYPE_U32; + GBE_ASSERT(AM != ir::AM_DynamicBti); + + if(isWrite){ + llvmValues = *(AI++); + const ir::Register values = getRegister(llvmValues); + const ir::Tuple tuple = ctx.arrayTuple(&values, 1); + ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true); + } else { + llvmValues = &I; + const ir::Register values = getRegister(llvmValues); + const ir::Tuple tuple = ctx.arrayTuple(&values, 1); + ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true); + } + + GBE_ASSERT(AI == AE); + } + /* append a new sampler. should be called before any reference to * a sampler_t value. */ uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) { @@ -4776,6 +4837,10 @@ namespace gbe ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2); break; } + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: + this->emitBlockReadWriteMemInst(I, CS, false); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: + this->emitBlockReadWriteMemInst(I, CS, true); break; default: break; } } diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 213ead0..003be91 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -202,7 +202,7 @@ DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_work_group_scan_ DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL, __gen_ocl_work_group_all) DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY, __gen_ocl_work_group_any) -// work group function +// sub group function DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BROADCAST, __gen_ocl_sub_group_broadcast) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_ADD, __gen_ocl_sub_group_reduce_add) @@ -217,5 +217,8 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_in DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, 
__gen_ocl_sub_group_block_read_mem) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem) + // common function DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp) -- 2.7.4 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet