Re: [Mesa-dev] [PATCH v2 17/20] radeonsi: do not do two full flushes on every compute dispatch
For all patches that don't contain my Reviewed-by tag, except patch 17: Reviewed-by: Marek Olšák. SURFACE_SYNC & ACQUIRE_MEM only wait for VS and PS if any of the BASE_ENA flags are set (according to our closed driver code). They don't wait for CS. Flushing L2 doesn't wait for anything. -- Marek. On Wed, Apr 13, 2016 at 9:30 PM, Bas Nieuwenhuizen wrote: > v2: Add more CS_PARTIAL_FLUSH events. > > Essentially every place with waits on finishing for pixel shaders > also has a write after read hazard with compute shaders. > > Invalidating L2 waits implicitly on pixel and compute shaders, > so, we don't need a CS_PARTIAL_FLUSH for switching FBO. > > Signed-off-by: Bas Nieuwenhuizen > --- > src/gallium/drivers/radeonsi/si_compute.c | 17 ++--- > src/gallium/drivers/radeonsi/si_cp_dma.c | 6 -- > src/gallium/drivers/radeonsi/si_descriptors.c | 3 ++- > src/gallium/drivers/radeonsi/si_state.c | 6 -- > 4 files changed, 12 insertions(+), 20 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/si_compute.c > b/src/gallium/drivers/radeonsi/si_compute.c > index 10b88b3..6803334 100644 > --- a/src/gallium/drivers/radeonsi/si_compute.c > +++ b/src/gallium/drivers/radeonsi/si_compute.c > @@ -439,13 +439,8 @@ static void si_launch_grid( > if (!sctx->cs_shader_state.initialized) > si_initialize_compute(sctx); > > - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | > -SI_CONTEXT_INV_GLOBAL_L2 | > -SI_CONTEXT_INV_ICACHE | > -SI_CONTEXT_INV_SMEM_L1 | > -SI_CONTEXT_FLUSH_WITH_INV_L2 | > -SI_CONTEXT_FLAG_COMPUTE; > - si_emit_cache_flush(sctx, NULL); > + if (sctx->b.flags) > + si_emit_cache_flush(sctx, NULL); > > if (!si_switch_compute_shader(sctx, program, &program->shader, > info->pc)) > return; > @@ -478,14 +473,6 @@ static void si_launch_grid( > si_setup_tgsi_grid(sctx, info); > > si_emit_dispatch_packets(sctx, info); > - > - sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | > -SI_CONTEXT_INV_VMEM_L1 | > -SI_CONTEXT_INV_GLOBAL_L2 | > -SI_CONTEXT_INV_ICACHE | > -SI_CONTEXT_INV_SMEM_L1 | > -SI_CONTEXT_FLAG_COMPUTE; >
- si_emit_cache_flush(sctx, NULL); > } > > > diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c > b/src/gallium/drivers/radeonsi/si_cp_dma.c > index 001ddd4..38e0ee6 100644 > --- a/src/gallium/drivers/radeonsi/si_cp_dma.c > +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c > @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, > struct pipe_resource *dst, > uint64_t va = r600_resource(dst)->gpu_address + offset; > > /* Flush the caches. */ > - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; > + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | > +SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; > > while (size) { > unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); > @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx, > } > > /* Flush the caches. */ > - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; > + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | > +SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; > > /* This is the main part doing the copying. Src is always aligned. */ > main_dst_offset = dst_offset + skipped_size; > diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c > b/src/gallium/drivers/radeonsi/si_descriptors.c > index a2c096f..04dada6 100644 > --- a/src/gallium/drivers/radeonsi/si_descriptors.c > +++ b/src/gallium/drivers/radeonsi/si_descriptors.c > @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context > *ctx, > * start writing to the targets. 
> */ > if (num_targets) > - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; > + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | > +SI_CONTEXT_CS_PARTIAL_FLUSH; > > /* Streamout buffers must be bound in 2 places: > * 1) in VGT by setting the VGT_STRMOUT registers > diff --git a/src/gallium/drivers/radeonsi/si_state.c > b/src/gallium/drivers/radeonsi/si_state.c > index 82ae4c4..a62dc52 100644 > --- a/src/gallium/drivers/radeonsi/si_state.c > +++ b/src/gallium/drivers/radeonsi/si_state.c > @@ -3467,7 +3467,8 @@ static void si_memory_barrier(struct pipe_context *ctx, > unsigned flags) > > /* Subsequent commands must wait for all shader invocations to > * complete. */ > - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; > + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | > +SI_CONTEXT_CS_PARTIAL_FLUSH; > > if (flags & PI
[Mesa-dev] [PATCH v2 17/20] radeonsi: do not do two full flushes on every compute dispatch
v2: Add more CS_PARTIAL_FLUSH events. Essentially every place that waits for pixel shaders to finish also has a write-after-read hazard with compute shaders. Invalidating L2 implicitly waits for pixel and compute shaders, so we don't need a CS_PARTIAL_FLUSH when switching FBOs. Signed-off-by: Bas Nieuwenhuizen --- src/gallium/drivers/radeonsi/si_compute.c | 17 ++--------------- src/gallium/drivers/radeonsi/si_cp_dma.c | 6 ++++-- src/gallium/drivers/radeonsi/si_descriptors.c | 3 ++- src/gallium/drivers/radeonsi/si_state.c | 6 ++++-- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 10b88b3..6803334 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -439,13 +439,8 @@ static void si_launch_grid( if (!sctx->cs_shader_state.initialized) si_initialize_compute(sctx); - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | -SI_CONTEXT_INV_GLOBAL_L2 | -SI_CONTEXT_INV_ICACHE | -SI_CONTEXT_INV_SMEM_L1 | -SI_CONTEXT_FLUSH_WITH_INV_L2 | -SI_CONTEXT_FLAG_COMPUTE; - si_emit_cache_flush(sctx, NULL); + if (sctx->b.flags) + si_emit_cache_flush(sctx, NULL); if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc)) return; @@ -478,14 +473,6 @@ static void si_launch_grid( si_setup_tgsi_grid(sctx, info); si_emit_dispatch_packets(sctx, info); - - sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | -SI_CONTEXT_INV_VMEM_L1 | -SI_CONTEXT_INV_GLOBAL_L2 | -SI_CONTEXT_INV_ICACHE | -SI_CONTEXT_INV_SMEM_L1 | -SI_CONTEXT_FLAG_COMPUTE; - si_emit_cache_flush(sctx, NULL); } diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 001ddd4..38e0ee6 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, uint64_t va = r600_resource(dst)->gpu_address + offset; /* Flush the
caches. */ - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | +SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; while (size) { unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx, } /* Flush the caches. */ - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | +SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; /* This is the main part doing the copying. Src is always aligned. */ main_dst_offset = dst_offset + skipped_size; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index a2c096f..04dada6 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, * start writing to the targets. */ if (num_targets) - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | +SI_CONTEXT_CS_PARTIAL_FLUSH; /* Streamout buffers must be bound in 2 places: * 1) in VGT by setting the VGT_STRMOUT registers diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 82ae4c4..a62dc52 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -3467,7 +3467,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) /* Subsequent commands must wait for all shader invocations to * complete. 
*/ - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | +SI_CONTEXT_CS_PARTIAL_FLUSH; if (flags & PIPE_BARRIER_CONSTANT_BUFFER) sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 | @@ -3477,7 +3478,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE | PIPE_BARRIER_IMAGE | -PIPE_BARRIER_STREAMOUT_BUFFER)) { +PIPE_BARRIER_STREAMOUT_BUFFER | +PIPE_BARRIER_GLOBAL_BUFFER)) { /* As far as I can tell, L1 contents