It might be better to just prefetch no samplers in this case? -- a shader that has this many active samplers "probably" doesn't actually use them all in a single invocation.
On Thu, Apr 30, 2015 at 5:23 AM, Kenneth Graunke <[email protected]> wrote: > On Wednesday, April 29, 2015 07:47:26 PM Pohjolainen, Topi wrote: >> On Thu, Apr 23, 2015 at 09:58:22PM +0300, Pohjolainen, Topi wrote: >> > On Thu, Apr 23, 2015 at 11:53:49AM -0700, Matt Turner wrote: >> > > On Wed, Apr 22, 2015 at 1:47 PM, Topi Pohjolainen >> > > <[email protected]> wrote: >> > > > Signed-off-by: Topi Pohjolainen <[email protected]> >> > > > --- >> > > > src/mesa/drivers/dri/i965/brw_state.h | 12 +++++ >> > > > src/mesa/drivers/dri/i965/gen8_ps_state.c | 74 >> > > > ++++++++++++++++++++----------- >> > > > 2 files changed, 59 insertions(+), 27 deletions(-) >> > > > >> > > > diff --git a/src/mesa/drivers/dri/i965/brw_state.h >> > > > b/src/mesa/drivers/dri/i965/brw_state.h >> > > > index 178f039..0c4f65e 100644 >> > > > --- a/src/mesa/drivers/dri/i965/brw_state.h >> > > > +++ b/src/mesa/drivers/dri/i965/brw_state.h >> > > > @@ -265,6 +265,18 @@ void gen7_set_surface_mcs_info(struct brw_context >> > > > *brw, >> > > > void gen7_check_surface_setup(uint32_t *surf, bool is_render_target); >> > > > void gen7_init_vtable_surface_functions(struct brw_context *brw); >> > > > >> > > > +/* gen8_ps_state.c */ >> > > > +void gen8_upload_ps_state(struct brw_context *brw, >> > > > + const struct gl_fragment_program *fp, >> > > > + const struct brw_stage_state *stage_state, >> > > > + const struct brw_wm_prog_data *prog_data, >> > > > + uint32_t fast_clear_op); >> > > > + >> > > > +void gen8_upload_ps_extra(struct brw_context *brw, >> > > > + const struct gl_fragment_program *fp, >> > > > + const struct brw_wm_prog_data *prog_data, >> > > > + bool multisampled_fbo); >> > > > + >> > > > /* gen7_sol_state.c */ >> > > > void gen7_upload_3dstate_so_decl_list(struct brw_context *brw, >> > > > const struct brw_vue_map >> > > > *vue_map); >> > > > diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c >> > > > b/src/mesa/drivers/dri/i965/gen8_ps_state.c >> > > > index 5f39e12..da6136b 100644 >> > > > --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c >> > > > +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c >> > > > @@ -27,15 +27,13 @@ >> > > > #include "brw_defines.h" >> > > > #include "intel_batchbuffer.h" >> > > > >> > > > -static void >> > > > -upload_ps_extra(struct brw_context *brw) >> > > > +void >> > > > +gen8_upload_ps_extra(struct brw_context *brw, >> > > > + const struct gl_fragment_program *fp, >> > > > + const struct brw_wm_prog_data *prog_data, >> > > > + bool multisampled_fbo) >> > > > { >> > > > struct gl_context *ctx = &brw->ctx; >> > > > - /* BRW_NEW_FRAGMENT_PROGRAM */ >> > > > - const struct brw_fragment_program *fp = >> > > > - brw_fragment_program_const(brw->fragment_program); >> > > > - /* BRW_NEW_FS_PROG_DATA */ >> > > > - const struct brw_wm_prog_data *prog_data = brw->wm.prog_data; >> > > > uint32_t dw1 = 0; >> > > > >> > > > dw1 |= GEN8_PSX_PIXEL_SHADER_VALID; >> > > > @@ -47,16 +45,14 @@ upload_ps_extra(struct brw_context *brw) >> > > > if (prog_data->num_varying_inputs != 0) >> > > > dw1 |= GEN8_PSX_ATTRIBUTE_ENABLE; >> > > > >> > > > - if (fp->program.Base.InputsRead & VARYING_BIT_POS) >> > > > + if (fp->Base.InputsRead & VARYING_BIT_POS) >> > > > dw1 |= GEN8_PSX_USES_SOURCE_DEPTH | GEN8_PSX_USES_SOURCE_W; >> > > > >> > > > - /* BRW_NEW_NUM_SAMPLES | _NEW_MULTISAMPLE */ >> > > > - bool multisampled_fbo = brw->num_samples > 1; >> > > > if (multisampled_fbo && >> > > > - _mesa_get_min_invocations_per_fragment(ctx, &fp->program, >> > > > false) > 1) >> > > > + _mesa_get_min_invocations_per_fragment(ctx, fp, false) > 1) >> > > > dw1 |= GEN8_PSX_SHADER_IS_PER_SAMPLE; >> > > > >> > > > - if (fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) >> > > > + if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) >> > > > dw1 |= GEN8_PSX_SHADER_USES_INPUT_COVERAGE_MASK; >> > > > >> > > > if (prog_data->uses_omask) >> > > > @@ -68,6 +64,20 @@ upload_ps_extra(struct brw_context *brw) >> > > > ADVANCE_BATCH(); >> > > > } >> > > > >> > > > +static void >> > > > +upload_ps_extra(struct brw_context *brw) >> > > > +{ >> > > > + /* BRW_NEW_FRAGMENT_PROGRAM */ >> > > > + const struct brw_fragment_program *fp = >> > > > + brw_fragment_program_const(brw->fragment_program); >> > > > + /* BRW_NEW_FS_PROG_DATA */ >> > > > + const struct brw_wm_prog_data *prog_data = brw->wm.prog_data; >> > > > + /* BRW_NEW_NUM_SAMPLES | _NEW_MULTISAMPLE */ >> > > > + const bool multisampled_fbo = brw->num_samples > 1; >> > > > + >> > > > + gen8_upload_ps_extra(brw, &fp->program, prog_data, >> > > > multisampled_fbo); >> > > > +} >> > > > + >> > > > const struct brw_tracked_state gen8_ps_extra = { >> > > > .dirty = { >> > > > .mesa = _NEW_MULTISAMPLE, >> > > > @@ -118,23 +128,24 @@ const struct brw_tracked_state gen8_wm_state = { >> > > > .emit = upload_wm_state, >> > > > }; >> > > > >> > > > -static void >> > > > -upload_ps_state(struct brw_context *brw) >> > > > +void >> > > > +gen8_upload_ps_state(struct brw_context *brw, >> > > > + const struct gl_fragment_program *fp, >> > > > + const struct brw_stage_state *stage_state, >> > > > + const struct brw_wm_prog_data *prog_data, >> > > > + uint32_t fast_clear_op) >> > > > { >> > > > struct gl_context *ctx = &brw->ctx; >> > > > uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0; >> > > > >> > > > - /* BRW_NEW_FS_PROG_DATA */ >> > > > - const struct brw_wm_prog_data *prog_data = brw->wm.prog_data; >> > > > - >> > > > /* Initialize the execution mask with VMask. Otherwise, >> > > > derivatives are >> > > > * incorrect for subspans where some of the pixels are unlit. We >> > > > believe >> > > > * the bit just didn't take effect in previous generations. >> > > > */ >> > > > dw3 |= GEN7_PS_VECTOR_MASK_ENABLE; >> > > > >> > > > - dw3 |= >> > > > - (ALIGN(brw->wm.base.sampler_count, 4) / 4) << >> > > > GEN7_PS_SAMPLER_COUNT_SHIFT; >> > > > + dw3 |= (ALIGN(stage_state->sampler_count, 4) / 4) << >> > > > + GEN7_PS_SAMPLER_COUNT_SHIFT; >> > > >> > > Indentation of this line looks a little bit odd. Probably remove a >> > > couple of spaces before GEN7_PS_SAMPLER_COUNT_SHIFT. >> > >> > Good catch, I can also start using SET_FIELD(), thanks! >> >> Only that we now have regression in piglit tests: >> >> max-samplers: >> ../../../../../../src/mesa/drivers/dri/i965/gen8_ps_state.c:148: >> gen8_upload_ps_state: Assertion `(fieldval & ~ >> (((1<<((29)-(27)+1))-1)<<(27))) == 0' failed. >> >> Original just tries to use a value that doesn't fit, but the assertion >> provided by SET_FIELD() now catches this. > > Whoops, that's my bug :) 3DSTATE_PS only allows prefetching up to 16 > samplers - I should have fixed this when expanding to 32 samplers on > Haswell+. The good news is, we're just trashing "reserved" bits in > the field, not other fields. > > Topi and I chatted about this on IRC, and I suggested simply clamping > sampler_count to 16 (since it's just for prefetching anyway). He's > planning to send a patch for that. > > --Ken > > _______________________________________________ > mesa-dev mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/mesa-dev > _______________________________________________ mesa-dev mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/mesa-dev
