Re: [Mesa-dev] [PATCH] i965/blorp: Special-case the clear color in MSAA resolves

2016-05-11 Thread Jason Ekstrand
I need to rescind this patch.  I thought it worked but it's far more
half-baked than I realized.  There's some issue with swizzles interacting
with the clear color. :-(
--Jason

On Tue, May 10, 2016 at 9:45 PM, Jason Ekstrand 
wrote:

> The current MSAA resolve code has a special-case for if the MCS value is 0.
> In this case we only need to sample once because we know that all values are in
> slice 0.  This commit adds a second optimization that detects the magic MCS
> value that indicates the clear color and grabs the color from a push
> constant and avoids sampling altogether.  On a microbenchmark written by
> Neil Roberts that tests resolving surfaces with just clear color, this
> improves performance by 60% for 8x, 40% for 4x, and 28% for 2x MSAA on my
> SKL GT3e laptop.  The benchmark can be found on the ML archive:
>
> https://lists.freedesktop.org/archives/mesa-dev/2016-February/108077.html
> ---
>  src/mesa/drivers/dri/i965/brw_blorp.h|  4 +-
>  src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 72
> ++--
>  2 files changed, 71 insertions(+), 5 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h
> b/src/mesa/drivers/dri/i965/brw_blorp.h
> index 5f7569c..550c6c5 100644
> --- a/src/mesa/drivers/dri/i965/brw_blorp.h
> +++ b/src/mesa/drivers/dri/i965/brw_blorp.h
> @@ -197,7 +197,9 @@ struct brw_blorp_wm_push_constants
> uint32_t src_z;
>
> /* Pad out to an integral number of registers */
> -   uint32_t pad[5];
> +   uint32_t pad;
> +
> +   union gl_color_union clear_color;
>  };
>
>  #define BRW_BLORP_NUM_PUSH_CONSTANT_DWORDS \
> diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
> b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
> index 97e3908..314034e 100644
> --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
> @@ -346,6 +346,7 @@ struct brw_blorp_blit_vars {
>nir_variable *offset;
> } u_x_transform, u_y_transform;
> nir_variable *u_src_z;
> +   nir_variable *u_clear_color;
>
> /* gl_FragCoord */
> nir_variable *frag_coord;
> @@ -374,6 +375,7 @@ brw_blorp_blit_vars_init(nir_builder *b, struct
> brw_blorp_blit_vars *v,
> LOAD_UNIFORM(y_transform.multiplier, glsl_float_type())
> LOAD_UNIFORM(y_transform.offset, glsl_float_type())
> LOAD_UNIFORM(src_z, glsl_uint_type())
> +   LOAD_UNIFORM(clear_color, glsl_vec4_type())
>
>  #undef DECL_UNIFORM
>
> @@ -858,7 +860,8 @@ static nir_ssa_def *
>  blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def *pos,
> unsigned tex_samples,
> enum intel_msaa_layout tex_layout,
> -   enum brw_reg_type dst_type)
> +   enum brw_reg_type dst_type,
> +   struct brw_blorp_blit_vars *v)
>  {
> /* If non-null, this is the outer-most if statement */
> nir_if *outer_if = NULL;
> @@ -867,9 +870,53 @@ blorp_nir_manual_blend_average(nir_builder *b,
> nir_ssa_def *pos,
>nir_local_variable_create(b->impl, glsl_vec4_type(), "color");
>
> nir_ssa_def *mcs = NULL;
> -   if (tex_layout == INTEL_MSAA_LAYOUT_CMS)
> +   if (tex_layout == INTEL_MSAA_LAYOUT_CMS) {
>mcs = blorp_nir_txf_ms_mcs(b, pos);
>
> +  /* The MCS buffer stores a packed value that provides a mapping from
> +   * samples to array slices.  The magic value of all ones means that
> all
> +   * samples have the clear color.  In this case, we can
> short-circuit the
> +   * sampling process and just use the clear color that we pushed
> into the
> +   * shader.
> +   */
> +  nir_ssa_def *is_clear_color;
> +  switch (tex_samples) {
> +  case 2:
> + /* Empirical evidence suggests that the value returned from the
> +  * sampler is not always 0x3 for clear color so we need to mask
> it.
> +  */
> + is_clear_color =
> +nir_ieq(b, nir_iand(b, nir_channel(b, mcs, 0), nir_imm_int(b,
> 0x3)),
> +   nir_imm_int(b, 0x3));
> + break;
> +  case 4:
> + is_clear_color =
> +nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0xff));
> + break;
> +  case 8:
> + is_clear_color =
> +nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, ~0));
> + break;
> +  case 16:
> + is_clear_color =
> +nir_ior(b, nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b,
> ~0)),
> +   nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b,
> ~0)));
> + break;
> +  default:
> + unreachable("Invalid sample count");
> +  }
> +
> +  nir_if *if_stmt = nir_if_create(b->shader);
> +  if_stmt->condition = nir_src_for_ssa(is_clear_color);
> +  nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
> +
> +  b->cursor = nir_after_cf_list(&if_stmt->then_list);
> +  nir_store_var(b, color, nir_load_var(b, 

[Mesa-dev] [PATCH] i965/blorp: Special-case the clear color in MSAA resolves

2016-05-10 Thread Jason Ekstrand
The current MSAA resolve code has a special-case for if the MCS value is 0.
In this case we only need to sample once because we know that all values are in
slice 0.  This commit adds a second optimization that detects the magic MCS
value that indicates the clear color and grabs the color from a push
constant and avoids sampling altogether.  On a microbenchmark written by
Neil Roberts that tests resolving surfaces with just clear color, this
improves performance by 60% for 8x, 40% for 4x, and 28% for 2x MSAA on my
SKL GT3e laptop.  The benchmark can be found on the ML archive:

https://lists.freedesktop.org/archives/mesa-dev/2016-February/108077.html
---
 src/mesa/drivers/dri/i965/brw_blorp.h|  4 +-
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 72 ++--
 2 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h 
b/src/mesa/drivers/dri/i965/brw_blorp.h
index 5f7569c..550c6c5 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -197,7 +197,9 @@ struct brw_blorp_wm_push_constants
uint32_t src_z;
 
/* Pad out to an integral number of registers */
-   uint32_t pad[5];
+   uint32_t pad;
+
+   union gl_color_union clear_color;
 };
 
 #define BRW_BLORP_NUM_PUSH_CONSTANT_DWORDS \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp 
b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 97e3908..314034e 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -346,6 +346,7 @@ struct brw_blorp_blit_vars {
   nir_variable *offset;
} u_x_transform, u_y_transform;
nir_variable *u_src_z;
+   nir_variable *u_clear_color;
 
/* gl_FragCoord */
nir_variable *frag_coord;
@@ -374,6 +375,7 @@ brw_blorp_blit_vars_init(nir_builder *b, struct 
brw_blorp_blit_vars *v,
LOAD_UNIFORM(y_transform.multiplier, glsl_float_type())
LOAD_UNIFORM(y_transform.offset, glsl_float_type())
LOAD_UNIFORM(src_z, glsl_uint_type())
+   LOAD_UNIFORM(clear_color, glsl_vec4_type())
 
 #undef DECL_UNIFORM
 
@@ -858,7 +860,8 @@ static nir_ssa_def *
 blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def *pos,
unsigned tex_samples,
enum intel_msaa_layout tex_layout,
-   enum brw_reg_type dst_type)
+   enum brw_reg_type dst_type,
+   struct brw_blorp_blit_vars *v)
 {
/* If non-null, this is the outer-most if statement */
nir_if *outer_if = NULL;
@@ -867,9 +870,53 @@ blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def 
*pos,
   nir_local_variable_create(b->impl, glsl_vec4_type(), "color");
 
nir_ssa_def *mcs = NULL;
-   if (tex_layout == INTEL_MSAA_LAYOUT_CMS)
+   if (tex_layout == INTEL_MSAA_LAYOUT_CMS) {
   mcs = blorp_nir_txf_ms_mcs(b, pos);
 
+  /* The MCS buffer stores a packed value that provides a mapping from
+   * samples to array slices.  The magic value of all ones means that all
+   * samples have the clear color.  In this case, we can short-circuit the
+   * sampling process and just use the clear color that we pushed into the
+   * shader.
+   */
+  nir_ssa_def *is_clear_color;
+  switch (tex_samples) {
+  case 2:
+ /* Empirical evidence suggests that the value returned from the
+  * sampler is not always 0x3 for clear color so we need to mask it.
+  */
+ is_clear_color =
+nir_ieq(b, nir_iand(b, nir_channel(b, mcs, 0), nir_imm_int(b, 
0x3)),
+   nir_imm_int(b, 0x3));
+ break;
+  case 4:
+ is_clear_color =
+nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0xff));
+ break;
+  case 8:
+ is_clear_color =
+nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, ~0));
+ break;
+  case 16:
+ is_clear_color =
+nir_ior(b, nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, ~0)),
+   nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, ~0)));
+ break;
+  default:
+ unreachable("Invalid sample count");
+  }
+
+  nir_if *if_stmt = nir_if_create(b->shader);
+  if_stmt->condition = nir_src_for_ssa(is_clear_color);
+  nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
+
+  b->cursor = nir_after_cf_list(&if_stmt->then_list);
+  nir_store_var(b, color, nir_load_var(b, v->u_clear_color), 0xf);
+
+  b->cursor = nir_after_cf_list(&if_stmt->else_list);
+  outer_if = if_stmt;
+   }
+
/* We add together samples using a binary tree structure, e.g. for 4x MSAA:
 *
 *   result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
@@ -937,7 +984,8 @@ blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def 
*pos,
  nir_store_var(b, color, texture_data[0], 0xf);
 
  b->cursor =