Re: [Mesa-dev] [PATCHv2 4/4] anv/gen9: Optimize slice and subslice load balancing behavior.
Reviewed-by: Jason Ekstrand Thanks for figuring this out! On Mon, Aug 12, 2019 at 4:19 PM Francisco Jerez wrote: > See "i965/gen9: Optimize slice and subslice load balancing behavior." > for the rationale. According to Jason, improves Aztec Ruins > performance by 2.7%. > > Reviewed-by: Kenneth Graunke (v1) > > v2: Undo CPU performance micro-optimization done in i965 and iris due > to lack of data justifying it on anv. Use > cmd_buffer_apply_pipe_flushes wrapper instead of emitting pipe > control command directly. (Jason) > --- > src/intel/vulkan/anv_genX.h| 4 ++ > src/intel/vulkan/anv_private.h | 6 ++ > src/intel/vulkan/genX_blorp_exec.c | 4 ++ > src/intel/vulkan/genX_cmd_buffer.c | 95 ++ > 4 files changed, 109 insertions(+) > > diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h > index a5435e566a3..06c6b467acf 100644 > --- a/src/intel/vulkan/anv_genX.h > +++ b/src/intel/vulkan/anv_genX.h > @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct > anv_cmd_buffer *cmd_buffer); > > void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer > *cmd_buffer); > > +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, > +unsigned width, unsigned height, > +unsigned scale); > + > void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); > void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); > > diff --git a/src/intel/vulkan/anv_private.h > b/src/intel/vulkan/anv_private.h > index 2465f264354..b381386a716 100644 > --- a/src/intel/vulkan/anv_private.h > +++ b/src/intel/vulkan/anv_private.h > @@ -2421,6 +2421,12 @@ struct anv_cmd_state { > > bool > conditional_render_enabled; > > + /** > +* Last rendering scale argument provided to > +* genX(cmd_buffer_emit_hashing_mode)(). > +*/ > + unsigned current_hash_scale; > + > /** > * Array length is anv_cmd_state::pass::attachment_count. Array > content is > * valid only when recording a render pass instance. > diff --git a/src/intel/vulkan/genX_blorp_exec.c > b/src/intel/vulkan/genX_blorp_exec.c > index 1592e7f7e3d..239bfb47433 100644 > --- a/src/intel/vulkan/genX_blorp_exec.c > +++ b/src/intel/vulkan/genX_blorp_exec.c > @@ -223,6 +223,10 @@ genX(blorp_exec)(struct blorp_batch *batch, >genX(cmd_buffer_config_l3)(cmd_buffer, cfg); > } > > + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; > + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0, > + params->y1 - params->y0, scale); > + > #if GEN_GEN >= 11 > /* The PIPE_CONTROL command description says: > * > diff --git a/src/intel/vulkan/genX_cmd_buffer.c > b/src/intel/vulkan/genX_cmd_buffer.c > index 86ef1663ac4..10da132115c 100644 > --- a/src/intel/vulkan/genX_cmd_buffer.c > +++ b/src/intel/vulkan/genX_cmd_buffer.c > @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)( > */ > primary->state.current_pipeline = UINT32_MAX; > primary->state.current_l3_config = NULL; > + primary->state.current_hash_scale = 0; > > /* Each of the secondary command buffers will use its own state base > * address. We need to re-emit state base address for the primary > after > @@ -2663,6 +2664,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer > *cmd_buffer) > > genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); > > + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1); > + > genX(flush_pipeline_select_3d)(cmd_buffer); > > if (vb_emit) { > @@ -3925,6 +3928,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct > anv_cmd_buffer *cmd_buffer) > } > } > > +/** > + * Update the pixel hashing modes that determine the balancing of PS > threads > + * across subslices and slices. > + * > + * \param width Width bound of the rendering area (already scaled down if > \p > + * scale is greater than 1). > + * \param height Height bound of the rendering area (already scaled down > if \p > + * scale is greater than 1). > + * \param scale The number of framebuffer samples that could potentially > be > + * affected by an individual channel of the PS thread. This > is > + * typically one for single-sampled rendering, but for > operations > + * like CCS resolves and fast clears a single PS invocation > may > + * update a huge number of pixels, in which case a finer > + * balancing is desirable in order to maximally utilize the > + * bandwidth available. UINT_MAX can be used as shorthand > for > + * "finest hashing mode available". > + */ > +void > +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, > + unsigned width,
[Mesa-dev] [PATCHv2 4/4] anv/gen9: Optimize slice and subslice load balancing behavior.
See "i965/gen9: Optimize slice and subslice load balancing behavior." for the rationale. According to Jason, improves Aztec Ruins performance by 2.7%. Reviewed-by: Kenneth Graunke (v1) v2: Undo CPU performance micro-optimization done in i965 and iris due to lack of data justifying it on anv. Use cmd_buffer_apply_pipe_flushes wrapper instead of emitting pipe control command directly. (Jason) --- src/intel/vulkan/anv_genX.h| 4 ++ src/intel/vulkan/anv_private.h | 6 ++ src/intel/vulkan/genX_blorp_exec.c | 4 ++ src/intel/vulkan/genX_cmd_buffer.c | 95 ++ 4 files changed, 109 insertions(+) diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index a5435e566a3..06c6b467acf 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer); void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, +unsigned width, unsigned height, +unsigned scale); + void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 2465f264354..b381386a716 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -2421,6 +2421,12 @@ struct anv_cmd_state { bool conditional_render_enabled; + /** +* Last rendering scale argument provided to +* genX(cmd_buffer_emit_hashing_mode)(). +*/ + unsigned current_hash_scale; + /** * Array length is anv_cmd_state::pass::attachment_count. Array content is * valid only when recording a render pass instance. diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 1592e7f7e3d..239bfb47433 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -223,6 +223,10 @@ genX(blorp_exec)(struct blorp_batch *batch, genX(cmd_buffer_config_l3)(cmd_buffer, cfg); } + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0, + params->y1 - params->y0, scale); + #if GEN_GEN >= 11 /* The PIPE_CONTROL command description says: * diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 86ef1663ac4..10da132115c 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)( */ primary->state.current_pipeline = UINT32_MAX; primary->state.current_l3_config = NULL; + primary->state.current_hash_scale = 0; /* Each of the secondary command buffers will use its own state base * address. We need to re-emit state base address for the primary after @@ -2663,6 +2664,8 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config); + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1); + genX(flush_pipeline_select_3d)(cmd_buffer); if (vb_emit) { @@ -3925,6 +3928,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) } } +/** + * Update the pixel hashing modes that determine the balancing of PS threads + * across subslices and slices. + * + * \param width Width bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param height Height bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param scale The number of framebuffer samples that could potentially be + * affected by an individual channel of the PS thread. This is + * typically one for single-sampled rendering, but for operations + * like CCS resolves and fast clears a single PS invocation may + * update a huge number of pixels, in which case a finer + * balancing is desirable in order to maximally utilize the + * bandwidth available. UINT_MAX can be used as shorthand for + * "finest hashing mode available". + */ +void +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale) +{ +#if GEN_GEN == 9 + const struct gen_device_info *devinfo = _buffer->device->info; + const unsigned slice_hashing[] = { + /* Because all Gen9 platforms with more than one slice require + * three-way subslice hashing, a