Jordan Justen <jordan.l.jus...@intel.com> writes: > Signed-off-by: Jordan Justen <jordan.l.jus...@intel.com> > > --- > src/mesa/drivers/dri/i965/brw_cs.cpp | 54 +++++++++++++++++++++++++++++++++--- > 1 file changed, 50 insertions(+), 4 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp > b/src/mesa/drivers/dri/i965/brw_cs.cpp > index 541151a..9e51130 100644 > --- a/src/mesa/drivers/dri/i965/brw_cs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_cs.cpp > @@ -311,6 +311,7 @@ brw_upload_cs_state(struct brw_context *brw) > uint32_t offset; > uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, > 8 * 4, 64, &offset); > + struct gl_program *prog = (struct gl_program *) brw->compute_program; > struct brw_stage_state *stage_state = &brw->cs.base; > struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data; > struct brw_stage_prog_data *prog_data = &cs_prog_data->base; > @@ -327,8 +328,16 @@ brw_upload_cs_state(struct brw_context *brw) > > prog_data->binding_table.size_bytes, > 32, > &stage_state->bind_bo_offset); > > + unsigned local_id_dwords = 0; > + > + if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) { > + local_id_dwords = > + brw_cs_prog_local_id_payload_size(prog, cs_prog_data->simd_size); > + local_id_dwords = ALIGN(local_id_dwords, 32) / sizeof(uint32_t);
We don't need the ALIGN() here - brw_cs_prog_local_id_payload_size() is a multiple of dispatch_width (8, 16, or 32) and sizeof(uint32_t), so it's always at least 32 byte aligned. In fact, it looks like all uses of brw_cs_prog_local_id_payload_size() want number of dwords, not number of bytes, so maybe make that change? > + } > + > unsigned push_constant_data_size = > - prog_data->nr_params * sizeof(gl_constant_value); > + (prog_data->nr_params + local_id_dwords) * sizeof(gl_constant_value); > unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); > unsigned push_constant_regs = reg_aligned_constant_size / 32; > unsigned threads = get_cs_thread_count(cs_prog_data); > @@ -451,6 +460,9 @@ const struct brw_tracked_state brw_cs_state = { > * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6 > * registers worth of push constant space. > * > + * Note: Any updates to brw_cs_prog_local_id_payload_size or > + * fill_local_id_payload need to coordinated. > + * > * FINISHME: There are a few easy optimizations to consider. > * > * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is > @@ -472,6 +484,27 @@ brw_cs_prog_local_id_payload_size(const struct > gl_program *prog, > } > > > +static void > +fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, > + void *buffer, unsigned &x, unsigned &y, unsigned &z) > +{ Avoid using C++ references here if we're planning to move it to a C file? > + uint32_t *param = (uint32_t *)buffer; > + for (unsigned i = 0; i < cs_prog_data->simd_size; i++) { > + param[0 * cs_prog_data->simd_size + i] = x; > + param[1 * cs_prog_data->simd_size + i] = y; > + param[2 * cs_prog_data->simd_size + i] = z; > + > + x = (x + 1) % cs_prog_data->local_size[0]; I think I'd prefer something like: x++; if (x == cs_prog_data->local_size[0]) { x = 0; y++; if (y == cs_prog_data->local_size[1]) { ... } } which avoids the modulo per entry and looks a little more straightforward. With those changes, Reviewed-by: Kristian Høgsberg <k...@bitplanet.net> > + if (x == 0) { > + y = (y + 1) % cs_prog_data->local_size[1]; > + if (y == 0) { > + z = (z + 1) % cs_prog_data->local_size[2]; > + } > + } > + } > +} > + > + > /** > * Creates a region containing the push constants for the CS on gen7+. > * > @@ -492,6 +525,13 @@ brw_upload_cs_push_constants(struct brw_context *brw, > struct gl_context *ctx = &brw->ctx; > const struct brw_stage_prog_data *prog_data = > (brw_stage_prog_data*) cs_prog_data; > + unsigned local_id_dwords = 0; > + > + if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) { > + local_id_dwords = > + brw_cs_prog_local_id_payload_size(prog, cs_prog_data->simd_size); > + local_id_dwords = ALIGN(local_id_dwords, 32) / sizeof(uint32_t); > + } > > /* Updates the ParamaterValues[i] pointers for all parameters of the > * basic type of PROGRAM_STATE_VAR. > @@ -499,14 +539,14 @@ brw_upload_cs_push_constants(struct brw_context *brw, > /* XXX: Should this happen somewhere before to get our state flag set? */ > _mesa_load_state_parameters(ctx, prog->Parameters); > > - if (prog_data->nr_params == 0) { > + if (prog_data->nr_params == 0 && local_id_dwords == 0) { > stage_state->push_const_size = 0; > } else { > gl_constant_value *param; > unsigned i, t; > > const unsigned push_constant_data_size = > - prog_data->nr_params * sizeof(gl_constant_value); > + (local_id_dwords + prog_data->nr_params) * > sizeof(gl_constant_value); > const unsigned reg_aligned_constant_size = > ALIGN(push_constant_data_size, 32); > const unsigned param_aligned_count = > reg_aligned_constant_size / sizeof(*param); > @@ -522,9 +562,15 @@ brw_upload_cs_push_constants(struct brw_context *brw, > STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); > > /* _NEW_PROGRAM_CONSTANTS */ > + unsigned x = 0, y = 0, z = 0; > for (t = 0; t < threads; t++) { > + gl_constant_value *next_param = ¶m[t * param_aligned_count]; > + if (local_id_dwords > 0) { > + fill_local_id_payload(cs_prog_data, (void*)next_param, x, y, z); > + next_param += local_id_dwords; > + } > for (i = 0; i < prog_data->nr_params; i++) { > - param[t * param_aligned_count + i] = *prog_data->param[i]; > + next_param[i] = *prog_data->param[i]; > } > } > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev