Re: [Mesa-dev] [PATCH] i965/blorp: Special-case the clear color in MSAA resolves
I need to rescind this patch. I thought it worked but it's far more half-baked than I realized. There's some issue with swizzles interacting with the clear color. :-( --Jason On Tue, May 10, 2016 at 9:45 PM, Jason Ekstrand wrote: > The current MSAA resolve code has a special-case for if the MCS value is 0. > In this case we can only sample once because we know that all values are in > slice 0. This commit adds a second optimization that detects the magic MCS > value that indicates the clear color and grabs the color from a push > constant and avoids sampling altogether. On a microbenchmark written by > Neil Roberts that tests resolving surfaces with just clear color, this > improves performance by 60% for 8x, 40% for 4x, and 28% for 2x MSAA on my > SKL gte3 laptop. The benchmark can be found on the ML archive: > > https://lists.freedesktop.org/archives/mesa-dev/2016-February/108077.html > --- > src/mesa/drivers/dri/i965/brw_blorp.h| 4 +- > src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 72 > ++-- > 2 files changed, 71 insertions(+), 5 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h > b/src/mesa/drivers/dri/i965/brw_blorp.h > index 5f7569c..550c6c5 100644 > --- a/src/mesa/drivers/dri/i965/brw_blorp.h > +++ b/src/mesa/drivers/dri/i965/brw_blorp.h > @@ -197,7 +197,9 @@ struct brw_blorp_wm_push_constants > uint32_t src_z; > > /* Pad out to an integral number of registers */ > - uint32_t pad[5]; > + uint32_t pad; > + > + union gl_color_union clear_color; > }; > > #define BRW_BLORP_NUM_PUSH_CONSTANT_DWORDS \ > diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp > b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp > index 97e3908..314034e 100644 > --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp > +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp > @@ -346,6 +346,7 @@ struct brw_blorp_blit_vars { >nir_variable *offset; > } u_x_transform, u_y_transform; > nir_variable *u_src_z; > + nir_variable *u_clear_color; > > /* gl_FragCoord */ > nir_variable 
*frag_coord; > @@ -374,6 +375,7 @@ brw_blorp_blit_vars_init(nir_builder *b, struct > brw_blorp_blit_vars *v, > LOAD_UNIFORM(y_transform.multiplier, glsl_float_type()) > LOAD_UNIFORM(y_transform.offset, glsl_float_type()) > LOAD_UNIFORM(src_z, glsl_uint_type()) > + LOAD_UNIFORM(clear_color, glsl_vec4_type()) > > #undef DECL_UNIFORM > > @@ -858,7 +860,8 @@ static nir_ssa_def * > blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def *pos, > unsigned tex_samples, > enum intel_msaa_layout tex_layout, > - enum brw_reg_type dst_type) > + enum brw_reg_type dst_type, > + struct brw_blorp_blit_vars *v) > { > /* If non-null, this is the outer-most if statement */ > nir_if *outer_if = NULL; > @@ -867,9 +870,53 @@ blorp_nir_manual_blend_average(nir_builder *b, > nir_ssa_def *pos, >nir_local_variable_create(b->impl, glsl_vec4_type(), "color"); > > nir_ssa_def *mcs = NULL; > - if (tex_layout == INTEL_MSAA_LAYOUT_CMS) > + if (tex_layout == INTEL_MSAA_LAYOUT_CMS) { >mcs = blorp_nir_txf_ms_mcs(b, pos); > > + /* The MCS buffer stores a packed value that provides a mapping from > + * samples to array slices. The magic value of all ones means that > all > + * samples have the clear color. In this case, we can > short-circuit the > + * sampling process and just use the clear color that we pushed > into the > + * shader. > + */ > + nir_ssa_def *is_clear_color; > + switch (tex_samples) { > + case 2: > + /* Empirical evidence suggests that the value returned from the > + * sampler is not always 0x3 for clear color so we need to mask > it. 
> + */ > + is_clear_color = > +nir_ieq(b, nir_iand(b, nir_channel(b, mcs, 0), nir_imm_int(b, > 0x3)), > + nir_imm_int(b, 0x3)); > + break; > + case 4: > + is_clear_color = > +nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0xff)); > + break; > + case 8: > + is_clear_color = > +nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, ~0)); > + break; > + case 16: > + is_clear_color = > +nir_ior(b, nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, > ~0)), > + nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, > ~0))); > + break; > + default: > + unreachable("Invalid sample count"); > + } > + > + nir_if *if_stmt = nir_if_create(b->shader); > + if_stmt->condition = nir_src_for_ssa(is_clear_color); > + nir_cf_node_insert(b->cursor, &if_stmt->cf_node); > + > + b->cursor = nir_after_cf_list(&if_stmt->then_list); > + nir_store_var(b, color, nir_load_var(b,
[Mesa-dev] [PATCH] i965/blorp: Special-case the clear color in MSAA resolves
The current MSAA resolve code has a special-case for if the MCS value is 0. In this case we can only sample once because we know that all values are in slice 0. This commit adds a second optimization that detects the magic MCS value that indicates the clear color and grabs the color from a push constant and avoids sampling altogether. On a microbenchmark written by Neil Roberts that tests resolving surfaces with just clear color, this improves performance by 60% for 8x, 40% for 4x, and 28% for 2x MSAA on my SKL gte3 laptop. The benchmark can be found on the ML archive: https://lists.freedesktop.org/archives/mesa-dev/2016-February/108077.html --- src/mesa/drivers/dri/i965/brw_blorp.h| 4 +- src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 72 ++-- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h index 5f7569c..550c6c5 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp.h +++ b/src/mesa/drivers/dri/i965/brw_blorp.h @@ -197,7 +197,9 @@ struct brw_blorp_wm_push_constants uint32_t src_z; /* Pad out to an integral number of registers */ - uint32_t pad[5]; + uint32_t pad; + + union gl_color_union clear_color; }; #define BRW_BLORP_NUM_PUSH_CONSTANT_DWORDS \ diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp index 97e3908..314034e 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp @@ -346,6 +346,7 @@ struct brw_blorp_blit_vars { nir_variable *offset; } u_x_transform, u_y_transform; nir_variable *u_src_z; + nir_variable *u_clear_color; /* gl_FragCoord */ nir_variable *frag_coord; @@ -374,6 +375,7 @@ brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v, LOAD_UNIFORM(y_transform.multiplier, glsl_float_type()) LOAD_UNIFORM(y_transform.offset, glsl_float_type()) LOAD_UNIFORM(src_z, glsl_uint_type()) + LOAD_UNIFORM(clear_color, glsl_vec4_type()) #undef DECL_UNIFORM @@ 
-858,7 +860,8 @@ static nir_ssa_def * blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def *pos, unsigned tex_samples, enum intel_msaa_layout tex_layout, - enum brw_reg_type dst_type) + enum brw_reg_type dst_type, + struct brw_blorp_blit_vars *v) { /* If non-null, this is the outer-most if statement */ nir_if *outer_if = NULL; @@ -867,9 +870,53 @@ blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def *pos, nir_local_variable_create(b->impl, glsl_vec4_type(), "color"); nir_ssa_def *mcs = NULL; - if (tex_layout == INTEL_MSAA_LAYOUT_CMS) + if (tex_layout == INTEL_MSAA_LAYOUT_CMS) { mcs = blorp_nir_txf_ms_mcs(b, pos); + /* The MCS buffer stores a packed value that provides a mapping from + * samples to array slices. The magic value of all ones means that all + * samples have the clear color. In this case, we can short-circuit the + * sampling process and just use the clear color that we pushed into the + * shader. + */ + nir_ssa_def *is_clear_color; + switch (tex_samples) { + case 2: + /* Empirical evidence suggests that the value returned from the + * sampler is not always 0x3 for clear color so we need to mask it. 
+ */ + is_clear_color = +nir_ieq(b, nir_iand(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0x3)), + nir_imm_int(b, 0x3)); + break; + case 4: + is_clear_color = +nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0xff)); + break; + case 8: + is_clear_color = +nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, ~0)); + break; + case 16: + is_clear_color = +nir_ior(b, nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, ~0)), + nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, ~0))); + break; + default: + unreachable("Invalid sample count"); + } + + nir_if *if_stmt = nir_if_create(b->shader); + if_stmt->condition = nir_src_for_ssa(is_clear_color); + nir_cf_node_insert(b->cursor, &if_stmt->cf_node); + + b->cursor = nir_after_cf_list(&if_stmt->then_list); + nir_store_var(b, color, nir_load_var(b, v->u_clear_color), 0xf); + + b->cursor = nir_after_cf_list(&if_stmt->else_list); + outer_if = if_stmt; + } + /* We add together samples using a binary tree structure, e.g. for 4x MSAA: * * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4 @@ -937,7 +984,8 @@ blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def *pos, nir_store_var(b, color, texture_data[0], 0xf); b->cursor =