Reviewed-by: Ilia Mirkin <[email protected]>
On Mon, Oct 24, 2016 at 3:41 PM, Samuel Pitoiset <[email protected]> wrote:
> Shared memory is local to CTA, thus we should only wait for
> prior memory writes which are visible to other threads in
> the same CTA, and not at global level. This should speed up
> compute shaders which use shared memory.
>
> v2: - do not use ==
>
> Signed-off-by: Samuel Pitoiset <[email protected]>
> ---
>  src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
> index b47fc49..91cef81 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
> @@ -3561,12 +3561,15 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
>        geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode());
>        break;
>     case TGSI_OPCODE_MEMBAR:
> +   {
> +      uint32_t level = tgsi.getSrc(0).getValueU32(0, info);
>        geni = mkOp(OP_MEMBAR, TYPE_NONE, NULL);
>        geni->fixed = 1;
> -      if (tgsi.getSrc(0).getValueU32(0, info) & TGSI_MEMBAR_THREAD_GROUP)
> +      if (!(level & ~(TGSI_MEMBAR_THREAD_GROUP | TGSI_MEMBAR_SHARED)))
>           geni->subOp = NV50_IR_SUBOP_MEMBAR(M, CTA);
>        else
>           geni->subOp = NV50_IR_SUBOP_MEMBAR(M, GL);
> +   }
>        break;
>     case TGSI_OPCODE_ATOMUADD:
>     case TGSI_OPCODE_ATOMXCHG:
> --
> 2.10.1
>
> _______________________________________________
> mesa-dev mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
