Hi Xiuli, Yes, it is good to keep the number as 6, and I find that I forgot to modify syncStr. If there is no any other issue, I will send v2 to fix it.
Thanks! Ruiling > -----Original Message----- > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of > Xiuli Pan > Sent: Tuesday, April 5, 2016 1:33 PM > To: Song, Ruiling <ruiling.s...@intel.com> > Cc: beignet@lists.freedesktop.org > Subject: Re: [Beignet] [PATCH 3/7] GBE: add ocl 2.0 work_group_barrier > support. > > Hi Ruiling, > > The patch set LGTM, only a small problem about syncFieldNum. > > Thanks > Xiuli > > > On Fri, Apr 01, 2016 at 02:53:24PM +0800, Ruiling Song wrote: > > to do an image barrier, we need to: > > 1. flush L3 RW cache. > > 2. do a barrier gateway. > > 3. flush sampler cache. > > > > Note the fence argument maybe ORed together. > > We need to support non-immediate barrier() argument in future. > > > > Signed-off-by: Ruiling Song <ruiling.s...@intel.com> > > --- > > backend/src/backend/gen8_encoder.cpp | 24 > ++++++++++++++++++++++++ > > backend/src/backend/gen8_encoder.hpp | 2 ++ > > backend/src/backend/gen8_instruction.hpp | 6 +++++- > > backend/src/backend/gen9_context.cpp | 9 +++++++-- > > backend/src/backend/gen_context.cpp | 11 ++++++++--- > > backend/src/backend/gen_defs.hpp | 1 + > > backend/src/backend/gen_encoder.cpp | 8 ++++++-- > > backend/src/backend/gen_encoder.hpp | 3 ++- > > backend/src/ir/instruction.cpp | 3 ++- > > backend/src/ir/instruction.hpp | 6 ++++-- > > backend/src/libocl/include/ocl_sync.h | 1 + > > backend/src/libocl/src/ocl_barrier.ll | 27 ++------------------------- > > backend/src/libocl/src/ocl_sync.cl | 1 - > > backend/src/llvm/llvm_gen_backend.cpp | 28 > ++++++++++++++++++++++++++-- > > backend/src/llvm/llvm_gen_ocl_function.hxx | 2 +- > > 15 files changed, 91 insertions(+), 41 deletions(-) > > > > diff --git a/backend/src/backend/gen8_encoder.cpp > b/backend/src/backend/gen8_encoder.cpp > > index 9af8cee..a71e44a 100644 > > --- a/backend/src/backend/gen8_encoder.cpp > > +++ b/backend/src/backend/gen8_encoder.cpp > > @@ -494,6 +494,30 @@ namespace gbe > > > > this->setSrc1(&insn, GenRegister::immd(jip*8)); > > } > > + void Gen8Encoder::FENCE(GenRegister dst, bool flushRWCache) { > > + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > > + Gen8NativeInstruction *gen8_insn = &insn->gen8_insn; > > + this->setHeader(insn); > > + this->setDst(insn, dst); > > + this->setSrc0(insn, dst); > > + setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1); > > + gen8_insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE; > > + gen8_insn->bits3.gen7_memory_fence.commit_enable = 0x1; > > + gen8_insn->bits3.gen7_memory_fence.flush_rw = flushRWCache ? 1 : 0; > > + } > > + > > + void Gen8Encoder::FLUSH_SAMPLERCACHE(GenRegister dst) { > > + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > > + this->setHeader(insn); > > + this->setDst(insn, dst); > > + this->setSrc0(insn, GenRegister::ud8grf(0,0)); > > + unsigned msg_type = GEN_SAMPLER_MESSAGE_CACHE_FLUSH; > > + unsigned simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD32_64; > > + setSamplerMessage(insn, 0, 0, msg_type, > > + 1, 1, > > + true, > > + simd_mode, 0); > > + } > > > > void Gen8Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) { > > Gen8NativeInstruction *gen8_insn = &insn->gen8_insn; > > diff --git a/backend/src/backend/gen8_encoder.hpp > b/backend/src/backend/gen8_encoder.hpp > > index d83cde5..81a2e1f 100644 > > --- a/backend/src/backend/gen8_encoder.hpp > > +++ b/backend/src/backend/gen8_encoder.hpp > > @@ -38,6 +38,7 @@ namespace gbe > > > > /*! Jump indexed instruction */ > > virtual void JMPI(GenRegister src, bool longjmp = false); > > + virtual void FENCE(GenRegister dst, bool flushRWCache); > > /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump > distance */ > > virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip); > > virtual void F16TO32(GenRegister dest, GenRegister src0); > > @@ -59,6 +60,7 @@ namespace gbe > > virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned > char bti, > > unsigned char msg_type, uint32_t > > msg_length, > > bool header_present); > > + virtual void FLUSH_SAMPLERCACHE(GenRegister dst); > > virtual void setDst(GenNativeInstruction *insn, GenRegister dest); > > virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg); > > virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg); > > diff --git a/backend/src/backend/gen8_instruction.hpp > b/backend/src/backend/gen8_instruction.hpp > > index b45376d..67e5dc5 100644 > > --- a/backend/src/backend/gen8_instruction.hpp > > +++ b/backend/src/backend/gen8_instruction.hpp > > @@ -540,7 +540,11 @@ union Gen8NativeInstruction > > /*! Memory fence */ > > struct { > > uint32_t bti:8; > > - uint32_t pad:5; > > + uint32_t pad:1; > > + uint32_t flush_instruction:1; > > + uint32_t flush_texture:1; > > + uint32_t flush_constant:1; > > + uint32_t flush_rw:1; > > uint32_t commit_enable:1; > > uint32_t msg_type:4; > > uint32_t pad2:1; > > diff --git a/backend/src/backend/gen9_context.cpp > b/backend/src/backend/gen9_context.cpp > > index 326f5a1..aa0c174 100644 > > --- a/backend/src/backend/gen9_context.cpp > > +++ b/backend/src/backend/gen9_context.cpp > > @@ -34,9 +34,10 @@ namespace gbe > > const GenRegister fenceDst = ra->genReg(insn.dst(0)); > > uint32_t barrierType = insn.extra.barrierType; > > const GenRegister barrierId = ra- > >genReg(GenRegister::ud1grf(ir::ocl::barrierid)); > > + bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE; > > > > - if (barrierType == ir::syncGlobalBarrier) { > > - p->FENCE(fenceDst); > > + if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) { > > + p->FENCE(fenceDst, imageFence); > > p->MOV(fenceDst, fenceDst); > > } > > p->push(); > > @@ -53,5 +54,9 @@ namespace gbe > > // Now we wait for the other threads > > p->WAIT(); > > p->pop(); > > + if (imageFence) { > > + p->FLUSH_SAMPLERCACHE(fenceDst); > > + p->MOV(fenceDst, fenceDst); > > + } > > } > > } > > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > > index e55c25e..be78c02 100644 > > --- a/backend/src/backend/gen_context.cpp > > +++ b/backend/src/backend/gen_context.cpp > > @@ -1834,9 +1834,10 @@ namespace gbe > > const GenRegister fenceDst = ra->genReg(insn.dst(0)); > > uint32_t barrierType = insn.extra.barrierType; > > const GenRegister barrierId = ra- > >genReg(GenRegister::ud1grf(ir::ocl::barrierid)); > > + bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE; > > > > - if (barrierType == ir::syncGlobalBarrier) { > > - p->FENCE(fenceDst); > > + if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) { > > + p->FENCE(fenceDst, imageFence); > > p->MOV(fenceDst, fenceDst); > > } > > p->push(); > > @@ -1853,11 +1854,15 @@ namespace gbe > > // Now we wait for the other threads > > p->WAIT(); > > p->pop(); > > + if (imageFence) { > > + p->FLUSH_SAMPLERCACHE(fenceDst); > > + p->MOV(fenceDst, fenceDst); > > + } > > } > > > > void GenContext::emitFenceInstruction(const SelectionInstruction &insn) { > > const GenRegister dst = ra->genReg(insn.dst(0)); > > - p->FENCE(dst); > > + p->FENCE(dst, false); > > p->MOV(dst, dst); > > } > > > > diff --git a/backend/src/backend/gen_defs.hpp > b/backend/src/backend/gen_defs.hpp > > index 60ebdd6..271aaaf 100644 > > --- a/backend/src/backend/gen_defs.hpp > > +++ b/backend/src/backend/gen_defs.hpp > > @@ -424,6 +424,7 @@ enum GenMessageTarget { > > #define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6 > > #define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7 > > #define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10 > > +#define GEN_SAMPLER_MESSAGE_CACHE_FLUSH 0x1f > > > > /* for GEN5 only */ > > #define GEN_SAMPLER_SIMD_MODE_SIMD4X2 0 > > diff --git a/backend/src/backend/gen_encoder.cpp > b/backend/src/backend/gen_encoder.cpp > > index d3e1936..8ca4a5e 100644 > > --- a/backend/src/backend/gen_encoder.cpp > > +++ b/backend/src/backend/gen_encoder.cpp > > @@ -203,7 +203,7 @@ namespace gbe > > unsigned msg_length, unsigned > > response_length, > > bool header_present, bool > > end_of_thread) > > { > > - setSrc1(inst, GenRegister::immd(0)); > > + setSrc1(inst, GenRegister::immud(0)); > > inst->bits3.generic_gen5.header_present = header_present; > > inst->bits3.generic_gen5.response_length = response_length; > > inst->bits3.generic_gen5.msg_length = msg_length; > > @@ -957,7 +957,7 @@ namespace gbe > > insn->bits3.msg_gateway.notify = notifyN; > > } > > > > - void GenEncoder::FENCE(GenRegister dst) { > > + void GenEncoder::FENCE(GenRegister dst, bool flushRWCache) { > > GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > > this->setHeader(insn); > > this->setDst(insn, dst); > > @@ -1226,6 +1226,10 @@ namespace gbe > > header_present, > > simd_mode, return_format); > > } > > + void GenEncoder::FLUSH_SAMPLERCACHE(GenRegister dst) { > > + // only Gen8+ support flushing sampler cache > > + assert(0); > > + } > > > > void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, > unsigned char bti) > > { > > diff --git a/backend/src/backend/gen_encoder.hpp > b/backend/src/backend/gen_encoder.hpp > > index a4bf90c..d30d55c 100644 > > --- a/backend/src/backend/gen_encoder.hpp > > +++ b/backend/src/backend/gen_encoder.hpp > > @@ -144,7 +144,7 @@ namespace gbe > > /*! Forward the gateway message. */ > > void FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN = 0); > > /*! Memory fence message (to order loads and stores between threads) */ > > - void FENCE(GenRegister dst); > > + virtual void FENCE(GenRegister dst, bool flushRWCache); > > /*! Jump indexed instruction */ > > virtual void JMPI(GenRegister src, bool longjmp = false); > > /*! IF indexed instruction */ > > @@ -216,6 +216,7 @@ namespace gbe > > bool header_present, > > uint32_t simd_mode, > > uint32_t return_format); > > + virtual void FLUSH_SAMPLERCACHE(GenRegister dst); > > > > /*! TypedWrite instruction for texture */ > > virtual void TYPED_WRITE(GenRegister header, > > diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp > > index 415c255..9b3f0cd 100644 > > --- a/backend/src/ir/instruction.cpp > > +++ b/backend/src/ir/instruction.cpp > > @@ -1202,7 +1202,8 @@ namespace ir { > > SYNC_LOCAL_READ_FENCE | > > SYNC_LOCAL_WRITE_FENCE | > > SYNC_GLOBAL_READ_FENCE | > > - SYNC_GLOBAL_WRITE_FENCE; > > + SYNC_GLOBAL_WRITE_FENCE | > > + SYNC_IMAGE_FENCE; > > if (UNLIKELY(this->parameters > maxParams)) { > > whyNot = "Invalid parameters for sync instruction"; > > return false; > > diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp > > index 1017c19..467192c 100644 > > --- a/backend/src/ir/instruction.hpp > > +++ b/backend/src/ir/instruction.hpp > > @@ -503,17 +503,19 @@ namespace ir { > > SYNC_LOCAL_WRITE_FENCE = 1<<2, > > SYNC_GLOBAL_READ_FENCE = 1<<3, > > SYNC_GLOBAL_WRITE_FENCE = 1<<4, > > - SYNC_INVALID = 1<<5 > > + SYNC_IMAGE_FENCE = 1<<5, > > + SYNC_INVALID = 1<<6 > > }; > > > > /*! 5 bits to encode all possible synchronization capablities */ > > - static const uint32_t syncFieldNum = 5u; > > + static const uint32_t syncFieldNum = 7u; > > It seem this should be 6u if you only added one SYNC_IMAGE_FENCE > > > > > /*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */ > > static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC > |SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE; > > > > /*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */ > > static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC | > SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE; > > + static const uint32_t syncImageBarrier = SYNC_WORKGROUP_EXEC | > SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE | > SYNC_IMAGE_FENCE; > > > > /*! Sync instructions are used to order loads and stores for a given > > memory > > * space and/or to serialize threads at a given point in the program > > diff --git a/backend/src/libocl/include/ocl_sync.h > b/backend/src/libocl/include/ocl_sync.h > > index a2f0764..1487353 100644 > > --- a/backend/src/libocl/include/ocl_sync.h > > +++ b/backend/src/libocl/include/ocl_sync.h > > @@ -27,5 +27,6 @@ OVERLOADABLE void barrier(cl_mem_fence_flags flags); > > OVERLOADABLE void mem_fence(cl_mem_fence_flags flags); > > OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags); > > OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags); > > +#define work_group_barrier barrier > > cl_mem_fence_flags get_fence(void *ptr); > > #endif /* __OCL_SYNC_H__ */ > > diff --git a/backend/src/libocl/src/ocl_barrier.ll > b/backend/src/libocl/src/ocl_barrier.ll > > index 31087ba..009bd21 100644 > > --- a/backend/src/libocl/src/ocl_barrier.ll > > +++ b/backend/src/libocl/src/ocl_barrier.ll > > @@ -11,32 +11,9 @@ declare i32 @_get_local_mem_fence() nounwind > alwaysinline > > declare i32 @_get_global_mem_fence() nounwind alwaysinline > > declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate > > declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate > > -declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline > noduplicate > > +declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate > > > > define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline { > > - %1 = icmp eq i32 %flags, 3 > > - br i1 %1, label %barrier_local_global, label %barrier_local_check > > - > > -barrier_local_global: > > - call void @__gen_ocl_barrier_local_and_global() > > - br label %done > > - > > -barrier_local_check: > > - %2 = icmp eq i32 %flags, 1 > > - br i1 %2, label %barrier_local, label %barrier_global_check > > - > > -barrier_local: > > - call void @__gen_ocl_barrier_local() > > - br label %done > > - > > -barrier_global_check: > > - %3 = icmp eq i32 %flags, 2 > > - br i1 %3, label %barrier_global, label %done > > - > > -barrier_global: > > - call void @__gen_ocl_barrier_global() > > - br label %done > > - > > -done: > > + call void @__gen_ocl_barrier(i32 %flags) > > ret void > > } > > diff --git a/backend/src/libocl/src/ocl_sync.cl > b/backend/src/libocl/src/ocl_sync.cl > > index b85db3d..cac5cfd 100644 > > --- a/backend/src/libocl/src/ocl_sync.cl > > +++ b/backend/src/libocl/src/ocl_sync.cl > > @@ -20,7 +20,6 @@ > > > > void __gen_ocl_barrier_local(void); > > void __gen_ocl_barrier_global(void); > > -void __gen_ocl_barrier_local_and_global(void); > > > > OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) { > > } > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > > index a746d80..2b6875c 100644 > > --- a/backend/src/llvm/llvm_gen_backend.cpp > > +++ b/backend/src/llvm/llvm_gen_backend.cpp > > @@ -3654,7 +3654,7 @@ namespace gbe > > case GEN_OCL_FORCE_SIMD16: > > case GEN_OCL_LBARRIER: > > case GEN_OCL_GBARRIER: > > - case GEN_OCL_LGBARRIER: > > + case GEN_OCL_BARRIER: > > ctx.getFunction().setUseSLM(true); > > break; > > case GEN_OCL_WRITE_IMAGE_I: > > @@ -4344,7 +4344,31 @@ namespace gbe > > case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break; > > case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break; > > case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break; > > - case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | > ir::syncGlobalBarrier); break; > > + case GEN_OCL_BARRIER: > > + { > > + Constant *CPV = dyn_cast<Constant>(*AI); > > + unsigned syncFlag = 0; > > + if (CPV) { > > + const ir::Immediate &x = processConstantImm(CPV); > > + unsigned barrierArg = x.getIntegerValue(); > > + if (barrierArg & 0x1) { > > + syncFlag |= ir::syncLocalBarrier; > > + } > > + if (barrierArg & 0x2) { > > + syncFlag |= ir::syncGlobalBarrier; > > + } > > + if (barrierArg & 0x4) { > > + syncFlag |= ir::syncImageBarrier; > > + } > > + } else { > > + // FIXME we default it to do global fence and barrier. > > + // we need to do runtime check here. > > + syncFlag = ir::syncLocalBarrier | ir::syncGlobalBarrier; > > + } > > + > > + ctx.SYNC(syncFlag); > > + break; > > + } > > case GEN_OCL_ATOMIC_ADD0: > > case GEN_OCL_ATOMIC_ADD1: this- > >emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break; > > case GEN_OCL_ATOMIC_SUB0: > > diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx > b/backend/src/llvm/llvm_gen_ocl_function.hxx > > index 09feb1a..a47afdf 100644 > > --- a/backend/src/llvm/llvm_gen_ocl_function.hxx > > +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx > > @@ -30,7 +30,7 @@ DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin) > > // Barrier function > > DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local) > > DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global) > > -DECL_LLVM_GEN_FUNCTION(LGBARRIER, > __gen_ocl_barrier_local_and_global) > > +DECL_LLVM_GEN_FUNCTION(BARRIER, __gen_ocl_barrier) > > > > // To force SIMD8/16 compilation > > DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8, __gen_ocl_force_simd8) > > -- > > 2.4.1 > > > > _______________________________________________ > > Beignet mailing list > > Beignet@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/beignet > _______________________________________________ > Beignet mailing list > Beignet@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet