On Mon, Oct 24, 2016 at 10:29 AM, Samuel Pitoiset <[email protected]> wrote: > Shared memory is local to a CTA, thus we should only wait for > prior memory writes which are visible to other threads in > the same CTA, and not at global level. This should speed up > compute shaders which use shared memory. > > Signed-off-by: Samuel Pitoiset <[email protected]> > --- > src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 5 ++++- > 1 file changed, 4 insertions(+), 1 deletion(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp > index b47fc49..621a468 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp > @@ -3561,12 +3561,15 @@ Converter::handleInstruction(const struct > tgsi_full_instruction *insn) > geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode()); > break; > case TGSI_OPCODE_MEMBAR: > + { > + uint32_t level = tgsi.getSrc(0).getValueU32(0, info); > geni = mkOp(OP_MEMBAR, TYPE_NONE, NULL); > geni->fixed = 1; > - if (tgsi.getSrc(0).getValueU32(0, info) & TGSI_MEMBAR_THREAD_GROUP) > + if ((level & TGSI_MEMBAR_THREAD_GROUP) || level == TGSI_MEMBAR_SHARED)
Probably just level & (A | B), i.e. level & (TGSI_MEMBAR_THREAD_GROUP | TGSI_MEMBAR_SHARED) > geni->subOp = NV50_IR_SUBOP_MEMBAR(M, CTA); > else > geni->subOp = NV50_IR_SUBOP_MEMBAR(M, GL); > + } > break; > case TGSI_OPCODE_ATOMUADD: > case TGSI_OPCODE_ATOMXCHG: > -- > 2.10.1 > > _______________________________________________ > mesa-dev mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
