Re: [Mesa-dev] [PATCH v2 17/20] radeonsi: do not do two full flushes on every compute dispatch

2016-04-15 Thread Marek Olšák
For all patches that don't already have my Reviewed-by (Rb), except patch 17:

Reviewed-by: Marek Olšák 

SURFACE_SYNC & ACQUIRE_MEM only wait for VS and PS if any of the
BASE_ENA flags are set (according to our closed driver code). They
don't wait for CS. Flushing L2 doesn't wait for anything.

Marek

On Wed, Apr 13, 2016 at 9:30 PM, Bas Nieuwenhuizen
 wrote:
> v2: Add more CS_PARTIAL_FLUSH events.
>
> Essentially every place that waits for pixel shaders to finish
> also has a write-after-read hazard with compute shaders.
>
> Invalidating L2 implicitly waits for pixel and compute shaders,
> so we don't need a CS_PARTIAL_FLUSH when switching FBOs.
>
> Signed-off-by: Bas Nieuwenhuizen 
> ---
>  src/gallium/drivers/radeonsi/si_compute.c | 17 ++---
>  src/gallium/drivers/radeonsi/si_cp_dma.c  |  6 --
>  src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
>  src/gallium/drivers/radeonsi/si_state.c   |  6 --
>  4 files changed, 12 insertions(+), 20 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
> b/src/gallium/drivers/radeonsi/si_compute.c
> index 10b88b3..6803334 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -439,13 +439,8 @@ static void si_launch_grid(
> if (!sctx->cs_shader_state.initialized)
> si_initialize_compute(sctx);
>
> -   sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
> -SI_CONTEXT_INV_GLOBAL_L2 |
> -SI_CONTEXT_INV_ICACHE |
> -SI_CONTEXT_INV_SMEM_L1 |
> -SI_CONTEXT_FLUSH_WITH_INV_L2 |
> -SI_CONTEXT_FLAG_COMPUTE;
> -   si_emit_cache_flush(sctx, NULL);
> +   if (sctx->b.flags)
> +   si_emit_cache_flush(sctx, NULL);
>
> if (!si_switch_compute_shader(sctx, program, &program->shader, 
> info->pc))
> return;
> @@ -478,14 +473,6 @@ static void si_launch_grid(
> si_setup_tgsi_grid(sctx, info);
>
> si_emit_dispatch_packets(sctx, info);
> -
> -   sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> -SI_CONTEXT_INV_VMEM_L1 |
> -SI_CONTEXT_INV_GLOBAL_L2 |
> -SI_CONTEXT_INV_ICACHE |
> -SI_CONTEXT_INV_SMEM_L1 |
> -SI_CONTEXT_FLAG_COMPUTE;
> -   si_emit_cache_flush(sctx, NULL);
>  }
>
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 001ddd4..38e0ee6 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, 
> struct pipe_resource *dst,
> uint64_t va = r600_resource(dst)->gpu_address + offset;
>
> /* Flush the caches. */
> -   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
> +   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>
> while (size) {
> unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
> @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
> }
>
> /* Flush the caches. */
> -   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
> +   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>
> /* This is the main part doing the copying. Src is always aligned. */
> main_dst_offset = dst_offset + skipped_size;
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
> b/src/gallium/drivers/radeonsi/si_descriptors.c
> index a2c096f..04dada6 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context 
> *ctx,
>  * start writing to the targets.
>  */
> if (num_targets)
> -   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
> +   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +SI_CONTEXT_CS_PARTIAL_FLUSH;
>
> /* Streamout buffers must be bound in 2 places:
>  * 1) in VGT by setting the VGT_STRMOUT registers
> diff --git a/src/gallium/drivers/radeonsi/si_state.c 
> b/src/gallium/drivers/radeonsi/si_state.c
> index 82ae4c4..a62dc52 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -3467,7 +3467,8 @@ static void si_memory_barrier(struct pipe_context *ctx, 
> unsigned flags)
>
> /* Subsequent commands must wait for all shader invocations to
>  * complete. */
> -   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
> +   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +SI_CONTEXT_CS_PARTIAL_FLUSH;
>
> if (flags & PI

[Mesa-dev] [PATCH v2 17/20] radeonsi: do not do two full flushes on every compute dispatch

2016-04-13 Thread Bas Nieuwenhuizen
v2: Add more CS_PARTIAL_FLUSH events.

Essentially every place that waits for pixel shaders to finish
also has a write-after-read hazard with compute shaders.

Invalidating L2 implicitly waits for pixel and compute shaders,
so we don't need a CS_PARTIAL_FLUSH when switching FBOs.

Signed-off-by: Bas Nieuwenhuizen 
---
 src/gallium/drivers/radeonsi/si_compute.c | 17 ++---
 src/gallium/drivers/radeonsi/si_cp_dma.c  |  6 --
 src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
 src/gallium/drivers/radeonsi/si_state.c   |  6 --
 4 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 10b88b3..6803334 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -439,13 +439,8 @@ static void si_launch_grid(
if (!sctx->cs_shader_state.initialized)
si_initialize_compute(sctx);
 
-   sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-SI_CONTEXT_INV_GLOBAL_L2 |
-SI_CONTEXT_INV_ICACHE |
-SI_CONTEXT_INV_SMEM_L1 |
-SI_CONTEXT_FLUSH_WITH_INV_L2 |
-SI_CONTEXT_FLAG_COMPUTE;
-   si_emit_cache_flush(sctx, NULL);
+   if (sctx->b.flags)
+   si_emit_cache_flush(sctx, NULL);
 
if (!si_switch_compute_shader(sctx, program, &program->shader, 
info->pc))
return;
@@ -478,14 +473,6 @@ static void si_launch_grid(
si_setup_tgsi_grid(sctx, info);
 
si_emit_dispatch_packets(sctx, info);
-
-   sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-SI_CONTEXT_INV_VMEM_L1 |
-SI_CONTEXT_INV_GLOBAL_L2 |
-SI_CONTEXT_INV_ICACHE |
-SI_CONTEXT_INV_SMEM_L1 |
-SI_CONTEXT_FLAG_COMPUTE;
-   si_emit_cache_flush(sctx, NULL);
 }
 
 
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 001ddd4..38e0ee6 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, 
struct pipe_resource *dst,
uint64_t va = r600_resource(dst)->gpu_address + offset;
 
/* Flush the caches. */
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
while (size) {
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
@@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
}
 
/* Flush the caches. */
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
/* This is the main part doing the copying. Src is always aligned. */
main_dst_offset = dst_offset + skipped_size;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index a2c096f..04dada6 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context 
*ctx,
 * start writing to the targets.
 */
if (num_targets)
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH;
 
/* Streamout buffers must be bound in 2 places:
 * 1) in VGT by setting the VGT_STRMOUT registers
diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index 82ae4c4..a62dc52 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3467,7 +3467,8 @@ static void si_memory_barrier(struct pipe_context *ctx, 
unsigned flags)
 
/* Subsequent commands must wait for all shader invocations to
 * complete. */
-   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+   sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+SI_CONTEXT_CS_PARTIAL_FLUSH;
 
if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
@@ -3477,7 +3478,8 @@ static void si_memory_barrier(struct pipe_context *ctx, 
unsigned flags)
 PIPE_BARRIER_SHADER_BUFFER |
 PIPE_BARRIER_TEXTURE |
 PIPE_BARRIER_IMAGE |
-PIPE_BARRIER_STREAMOUT_BUFFER)) {
+PIPE_BARRIER_STREAMOUT_BUFFER |
+PIPE_BARRIER_GLOBAL_BUFFER)) {
/* As far as I can tell, L1 contents