Re: [Mesa-dev] [PATCHv2 4/4] anv/gen9: Optimize slice and subslice load balancing behavior.

2019-08-12 Thread Jason Ekstrand
Reviewed-by: Jason Ekstrand 

Thanks for figuring this out!

On Mon, Aug 12, 2019 at 4:19 PM Francisco Jerez 
wrote:

> See "i965/gen9: Optimize slice and subslice load balancing behavior."
> for the rationale.  According to Jason, improves Aztec Ruins
> performance by 2.7%.
>
> Reviewed-by: Kenneth Graunke  (v1)
>
> v2: Undo CPU performance micro-optimization done in i965 and iris due
> to lack of data justifying it on anv.  Use
> cmd_buffer_apply_pipe_flushes wrapper instead of emitting pipe
> control command directly.  (Jason)
> ---
>  src/intel/vulkan/anv_genX.h|  4 ++
>  src/intel/vulkan/anv_private.h |  6 ++
>  src/intel/vulkan/genX_blorp_exec.c |  4 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 95 ++
>  4 files changed, 109 insertions(+)
>
> diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
> index a5435e566a3..06c6b467acf 100644
> --- a/src/intel/vulkan/anv_genX.h
> +++ b/src/intel/vulkan/anv_genX.h
> @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct
> anv_cmd_buffer *cmd_buffer);
>
>  void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer
> *cmd_buffer);
>
> +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> +unsigned width, unsigned height,
> +unsigned scale);
> +
>  void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
>  void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
>
> diff --git a/src/intel/vulkan/anv_private.h
> b/src/intel/vulkan/anv_private.h
> index 2465f264354..b381386a716 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -2421,6 +2421,12 @@ struct anv_cmd_state {
>
> bool
>  conditional_render_enabled;
>
> +   /**
> +* Last rendering scale argument provided to
> +* genX(cmd_buffer_emit_hashing_mode)().
> +*/
> +   unsigned current_hash_scale;
> +
> /**
>  * Array length is anv_cmd_state::pass::attachment_count. Array
> content is
>  * valid only when recording a render pass instance.
> diff --git a/src/intel/vulkan/genX_blorp_exec.c
> b/src/intel/vulkan/genX_blorp_exec.c
> index 1592e7f7e3d..239bfb47433 100644
> --- a/src/intel/vulkan/genX_blorp_exec.c
> +++ b/src/intel/vulkan/genX_blorp_exec.c
> @@ -223,6 +223,10 @@ genX(blorp_exec)(struct blorp_batch *batch,
>genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
> }
>
> +   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
> +   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
> +  params->y1 - params->y0, scale);
> +
>  #if GEN_GEN >= 11
> /* The PIPE_CONTROL command description says:
>  *
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c
> b/src/intel/vulkan/genX_cmd_buffer.c
> index 86ef1663ac4..10da132115c 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)(
>  */
> primary->state.current_pipeline = UINT32_MAX;
> primary->state.current_l3_config = NULL;
> +   primary->state.current_hash_scale = 0;
>
> /* Each of the secondary command buffers will use its own state base
>  * address.  We need to re-emit state base address for the primary
> after
> @@ -2663,6 +2664,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer
> *cmd_buffer)
>
> genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
>
> +   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
> +
> genX(flush_pipeline_select_3d)(cmd_buffer);
>
> if (vb_emit) {
> @@ -3925,6 +3928,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct
> anv_cmd_buffer *cmd_buffer)
> }
>  }
>
> +/**
> + * Update the pixel hashing modes that determine the balancing of PS
> threads
> + * across subslices and slices.
> + *
> + * \param width Width bound of the rendering area (already scaled down if
> \p
> + *  scale is greater than 1).
> + * \param height Height bound of the rendering area (already scaled down
> if \p
> + *   scale is greater than 1).
> + * \param scale The number of framebuffer samples that could potentially
> be
> + *  affected by an individual channel of the PS thread.  This
> is
> + *  typically one for single-sampled rendering, but for
> operations
> + *  like CCS resolves and fast clears a single PS invocation
> may
> + *  update a huge number of pixels, in which case a finer
> + *  balancing is desirable in order to maximally utilize the
> + *  bandwidth available.  UINT_MAX can be used as shorthand
> for
> + *  "finest hashing mode available".
> + */
> +void
> +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
> +   unsigned width, 

[Mesa-dev] [PATCHv2 4/4] anv/gen9: Optimize slice and subslice load balancing behavior.

2019-08-12 Thread Francisco Jerez
See "i965/gen9: Optimize slice and subslice load balancing behavior."
for the rationale.  According to Jason, improves Aztec Ruins
performance by 2.7%.

Reviewed-by: Kenneth Graunke  (v1)

v2: Undo CPU performance micro-optimization done in i965 and iris due
to lack of data justifying it on anv.  Use
cmd_buffer_apply_pipe_flushes wrapper instead of emitting pipe
control command directly.  (Jason)
---
 src/intel/vulkan/anv_genX.h|  4 ++
 src/intel/vulkan/anv_private.h |  6 ++
 src/intel/vulkan/genX_blorp_exec.c |  4 ++
 src/intel/vulkan/genX_cmd_buffer.c | 95 ++
 4 files changed, 109 insertions(+)

diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index a5435e566a3..06c6b467acf 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct 
anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
 
+void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
+unsigned width, unsigned height,
+unsigned scale);
+
 void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
 void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
 
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 2465f264354..b381386a716 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2421,6 +2421,12 @@ struct anv_cmd_state {
 
bool conditional_render_enabled;
 
+   /**
+* Last rendering scale argument provided to
+* genX(cmd_buffer_emit_hashing_mode)().
+*/
+   unsigned current_hash_scale;
+
/**
 * Array length is anv_cmd_state::pass::attachment_count. Array content is
 * valid only when recording a render pass instance.
diff --git a/src/intel/vulkan/genX_blorp_exec.c 
b/src/intel/vulkan/genX_blorp_exec.c
index 1592e7f7e3d..239bfb47433 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -223,6 +223,10 @@ genX(blorp_exec)(struct blorp_batch *batch,
   genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
}
 
+   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
+   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
+  params->y1 - params->y0, scale);
+
 #if GEN_GEN >= 11
/* The PIPE_CONTROL command description says:
 *
diff --git a/src/intel/vulkan/genX_cmd_buffer.c 
b/src/intel/vulkan/genX_cmd_buffer.c
index 86ef1663ac4..10da132115c 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)(
 */
primary->state.current_pipeline = UINT32_MAX;
primary->state.current_l3_config = NULL;
+   primary->state.current_hash_scale = 0;
 
/* Each of the secondary command buffers will use its own state base
 * address.  We need to re-emit state base address for the primary after
@@ -2663,6 +2664,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer 
*cmd_buffer)
 
genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
 
+   genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
+
genX(flush_pipeline_select_3d)(cmd_buffer);
 
if (vb_emit) {
@@ -3925,6 +3928,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct 
anv_cmd_buffer *cmd_buffer)
}
 }
 
+/**
+ * Update the pixel hashing modes that determine the balancing of PS threads
+ * across subslices and slices.
+ *
+ * \param width Width bound of the rendering area (already scaled down if \p
+ *  scale is greater than 1).
+ * \param height Height bound of the rendering area (already scaled down if \p
+ *   scale is greater than 1).
+ * \param scale The number of framebuffer samples that could potentially be
+ *  affected by an individual channel of the PS thread.  This is
+ *  typically one for single-sampled rendering, but for operations
+ *  like CCS resolves and fast clears a single PS invocation may
+ *  update a huge number of pixels, in which case a finer
+ *  balancing is desirable in order to maximally utilize the
+ *  bandwidth available.  UINT_MAX can be used as shorthand for
+ *  "finest hashing mode available".
+ */
+void
+genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
+   unsigned width, unsigned height,
+   unsigned scale)
+{
+#if GEN_GEN == 9
+   const struct gen_device_info *devinfo = _buffer->device->info;
+   const unsigned slice_hashing[] = {
+  /* Because all Gen9 platforms with more than one slice require
+   * three-way subslice hashing, a