Re: [Mesa-dev] [PATCH v3] nir: add support for removing redundant stores to copy prop var

2018-11-12 Thread Jason Ekstrand
On Mon, Nov 12, 2018 at 8:57 PM Timothy Arceri 
wrote:

> For example the following type of thing is seen in TCS from
> a number of Vulkan and DXVK games:
>
> vec1 32 ssa_557 = deref_var  (shader_out float)
> vec1 32 ssa_558 = intrinsic load_deref (ssa_557) ()
> vec1 32 ssa_559 = deref_var @42 (shader_out float)
> vec1 32 ssa_560 = intrinsic load_deref (ssa_559) ()
> vec1 32 ssa_561 = deref_var @43 (shader_out float)
> vec1 32 ssa_562 = intrinsic load_deref (ssa_561) ()
> intrinsic store_deref (ssa_557, ssa_558) (1) /* wrmask=x */
> intrinsic store_deref (ssa_559, ssa_560) (1) /* wrmask=x */
> intrinsic store_deref (ssa_561, ssa_562) (1) /* wrmask=x */
>
> No shader-db changes on i965 (SKL).
>
> vkpipeline-db results RADV (VEGA):
>
> Totals from affected shaders:
> SGPRS: 7832 -> 7728 (-1.33 %)
> VGPRS: 6476 -> 6740 (4.08 %)
> Spilled SGPRs: 0 -> 0 (0.00 %)
> Spilled VGPRs: 0 -> 0 (0.00 %)
> Private memory VGPRs: 0 -> 0 (0.00 %)
> Scratch size: 0 -> 0 (0.00 %) dwords per thread
> Code Size: 469572 -> 456596 (-2.76 %) bytes
> LDS: 0 -> 0 (0.00 %) blocks
> Max Waves: 989 -> 960 (-2.93 %)
> Wait states: 0 -> 0 (0.00 %)
>
> The Max Waves and VGPRS changes here are misleading. What is
> happening is a bunch of TCS outputs are being optimised away as
> they are now recognised as unused. This results in more varyings
> being compacted via nir_compact_varyings() which can result in
> more register pressure when they are not packed in an optimal way.
> This is an existing problem independent of this patch. I've run
> some benchmarks and haven't noticed any performance regressions
> in affected games.
>
> V2: check all ssa values in case another store has altered the
> value of a component.
> v3: only check components that are actually written
> ---
>  src/compiler/nir/nir_opt_copy_prop_vars.c | 45 ++-
>  1 file changed, 35 insertions(+), 10 deletions(-)
>
> diff --git a/src/compiler/nir/nir_opt_copy_prop_vars.c
> b/src/compiler/nir/nir_opt_copy_prop_vars.c
> index 7a21ad56c79..4631c1b0223 100644
> --- a/src/compiler/nir/nir_opt_copy_prop_vars.c
> +++ b/src/compiler/nir/nir_opt_copy_prop_vars.c
> @@ -86,6 +86,21 @@ struct copy_prop_var_state {
> bool progress;
>  };
>
> +static bool
> +value_equals_ssa(struct value *value, nir_intrinsic_instr *intrin)
>

Maybe call it value_equals_store_src and name the intrinsic input "store"?
That's a bit more descriptive.  With that,

Reviewed-by: Jason Ekstrand 


> +{
> +   assert(intrin->intrinsic == nir_intrinsic_store_deref);
> +   uintptr_t write_mask = nir_intrinsic_write_mask(intrin);
> +
> +   for (unsigned i = 0; i < intrin->num_components; i++) {
> +  if ((write_mask & (1 << i)) &&
> +  value->ssa[i] != intrin->src[1].ssa)
> + return false;
> +   }
> +
> +   return true;
> +}
> +
>  static struct vars_written *
>  create_vars_written(struct copy_prop_var_state *state)
>  {
> @@ -676,18 +691,28 @@ copy_prop_vars_block(struct copy_prop_var_state
> *state,
>}
>
>case nir_intrinsic_store_deref: {
> - struct value value = {
> -.is_ssa = true
> - };
> -
> - for (unsigned i = 0; i < intrin->num_components; i++)
> -value.ssa[i] = intrin->src[1].ssa;
> -
>   nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
> - unsigned wrmask = nir_intrinsic_write_mask(intrin);
>   struct copy_entry *entry =
> -get_entry_and_kill_aliases(copies, dst, wrmask);
> - store_to_entry(state, entry, , wrmask);
> +lookup_entry_for_deref(copies, dst, nir_derefs_equal_bit);
> + if (entry && value_equals_ssa(>src, intrin)) {
> +/* If we are storing the value from a load of the same var the
> + * store is redundant so remove it.
> + */
> +nir_instr_remove(instr);
> + } else {
> +struct value value = {
> +   .is_ssa = true
> +};
> +
> +for (unsigned i = 0; i < intrin->num_components; i++)
> +   value.ssa[i] = intrin->src[1].ssa;
> +
> +unsigned wrmask = nir_intrinsic_write_mask(intrin);
> +struct copy_entry *entry =
> +   get_entry_and_kill_aliases(copies, dst, wrmask);
> +store_to_entry(state, entry, , wrmask);
> + }
> +
>   break;
>}
>
> --
> 2.19.1
>
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3] nir: add support for removing redundant stores to copy prop var

2018-11-12 Thread Timothy Arceri
For example the following type of thing is seen in TCS from
a number of Vulkan and DXVK games:

vec1 32 ssa_557 = deref_var  (shader_out float)
vec1 32 ssa_558 = intrinsic load_deref (ssa_557) ()
vec1 32 ssa_559 = deref_var @42 (shader_out float)
vec1 32 ssa_560 = intrinsic load_deref (ssa_559) ()
vec1 32 ssa_561 = deref_var @43 (shader_out float)
vec1 32 ssa_562 = intrinsic load_deref (ssa_561) ()
intrinsic store_deref (ssa_557, ssa_558) (1) /* wrmask=x */
intrinsic store_deref (ssa_559, ssa_560) (1) /* wrmask=x */
intrinsic store_deref (ssa_561, ssa_562) (1) /* wrmask=x */

No shader-db changes on i965 (SKL).

vkpipeline-db results RADV (VEGA):

Totals from affected shaders:
SGPRS: 7832 -> 7728 (-1.33 %)
VGPRS: 6476 -> 6740 (4.08 %)
Spilled SGPRs: 0 -> 0 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 469572 -> 456596 (-2.76 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 989 -> 960 (-2.93 %)
Wait states: 0 -> 0 (0.00 %)

The Max Waves and VGPRS changes here are misleading. What is
happening is a bunch of TCS outputs are being optimised away as
they are now recognised as unused. This results in more varyings
being compacted via nir_compact_varyings() which can result in
more register pressure when they are not packed in an optimal way.
This is an existing problem independent of this patch. I've run
some benchmarks and haven't noticed any performance regressions
in affected games.

V2: check all ssa values in case another store has altered the
value of a component.
v3: only check components that are actually written
---
 src/compiler/nir/nir_opt_copy_prop_vars.c | 45 ++-
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/src/compiler/nir/nir_opt_copy_prop_vars.c 
b/src/compiler/nir/nir_opt_copy_prop_vars.c
index 7a21ad56c79..4631c1b0223 100644
--- a/src/compiler/nir/nir_opt_copy_prop_vars.c
+++ b/src/compiler/nir/nir_opt_copy_prop_vars.c
@@ -86,6 +86,21 @@ struct copy_prop_var_state {
bool progress;
 };
 
+static bool
+value_equals_ssa(struct value *value, nir_intrinsic_instr *intrin)
+{
+   assert(intrin->intrinsic == nir_intrinsic_store_deref);
+   uintptr_t write_mask = nir_intrinsic_write_mask(intrin);
+
+   for (unsigned i = 0; i < intrin->num_components; i++) {
+  if ((write_mask & (1 << i)) &&
+  value->ssa[i] != intrin->src[1].ssa)
+ return false;
+   }
+
+   return true;
+}
+
 static struct vars_written *
 create_vars_written(struct copy_prop_var_state *state)
 {
@@ -676,18 +691,28 @@ copy_prop_vars_block(struct copy_prop_var_state *state,
   }
 
   case nir_intrinsic_store_deref: {
- struct value value = {
-.is_ssa = true
- };
-
- for (unsigned i = 0; i < intrin->num_components; i++)
-value.ssa[i] = intrin->src[1].ssa;
-
  nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
- unsigned wrmask = nir_intrinsic_write_mask(intrin);
  struct copy_entry *entry =
-get_entry_and_kill_aliases(copies, dst, wrmask);
- store_to_entry(state, entry, , wrmask);
+lookup_entry_for_deref(copies, dst, nir_derefs_equal_bit);
+ if (entry && value_equals_ssa(>src, intrin)) {
+/* If we are storing the value from a load of the same var the
+ * store is redundant so remove it.
+ */
+nir_instr_remove(instr);
+ } else {
+struct value value = {
+   .is_ssa = true
+};
+
+for (unsigned i = 0; i < intrin->num_components; i++)
+   value.ssa[i] = intrin->src[1].ssa;
+
+unsigned wrmask = nir_intrinsic_write_mask(intrin);
+struct copy_entry *entry =
+   get_entry_and_kill_aliases(copies, dst, wrmask);
+store_to_entry(state, entry, , wrmask);
+ }
+
  break;
   }
 
-- 
2.19.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev