This thread ID uniform will be used to compute the gl_LocalInvocationIndex and gl_LocalInvocationID values.
It is important for this uniform to be added in the last push constant register. fs_visitor::assign_constant_locations is updated to make sure this happens. The reason this is important is that the cross-thread push constant registers are loaded first, and the per-thread push constant registers are loaded after that. (Broadwell adds another push constant upload mechanism which reverses this order, but we are ignoring this for now.) v2: * Add variable in intrinsics lowering pass * Make sure the ID is pushed last in assign_constant_locations, and that we save a spot for the ID in the push constants Signed-off-by: Jordan Justen <[email protected]> --- src/mesa/drivers/dri/i965/brw_compiler.h | 1 + src/mesa/drivers/dri/i965/brw_fs.cpp | 49 +++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index a8fb486..f8379bc 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -433,6 +433,7 @@ struct brw_cs_prog_data { bool uses_barrier; bool uses_num_work_groups; unsigned local_invocation_id_regs; + int thread_local_id_index; struct { /** @{ diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index bb2caa5..82b6781 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2086,6 +2086,10 @@ fs_visitor::assign_constant_locations() bool contiguous[uniforms]; memset(contiguous, 0, sizeof(contiguous)); + int thread_local_id_index = + (stage == MESA_SHADER_COMPUTE) ? + ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1; + /* First, we walk through the instructions and do two things: * * 1) Figure out which uniforms are live. @@ -2130,6 +2134,9 @@ fs_visitor::assign_constant_locations() } } + if (thread_local_id_index >= 0 && !is_live[thread_local_id_index]) + thread_local_id_index = -1; + /* Only allow 16 registers (128 uniform components) as push constants. * * Just demote the end of the list. We could probably do better @@ -2158,6 +2165,9 @@ fs_visitor::assign_constant_locations() int chunk_start = -1; + /* We may need to save a slot for the thread ID */ + unsigned int saved_slots = thread_local_id_index >= 0 ? 1 : 0; + /* First push 64-bit uniforms to ensure they are properly aligned */ for (unsigned u = 0; u < uniforms; u++) { if (!is_live[u] || !is_live_64bit[u]) @@ -2166,8 +2176,8 @@ fs_visitor::assign_constant_locations() set_push_pull_constant_loc(u, &chunk_start, contiguous[u], push_constant_loc, pull_constant_loc, &num_push_constants, &num_pull_constants, - max_push_components, max_chunk_size, - stage_prog_data); + max_push_components - saved_slots, + max_chunk_size, stage_prog_data); } @@ -2176,13 +2186,29 @@ fs_visitor::assign_constant_locations() if (!is_live[u] || is_live_64bit[u]) continue; + /* Skip thread_local_id_index to put it in the last push register. */ + if (thread_local_id_index == (int)u) + continue; + + set_push_pull_constant_loc(u, &chunk_start, contiguous[u], + push_constant_loc, pull_constant_loc, + &num_push_constants, &num_pull_constants, + max_push_components - saved_slots, + max_chunk_size, stage_prog_data); + } + + if (thread_local_id_index >= 0) { + /* Add the CS thread ID uniform at the end */ + unsigned u = thread_local_id_index; set_push_pull_constant_loc(u, &chunk_start, contiguous[u], push_constant_loc, pull_constant_loc, &num_push_constants, &num_pull_constants, - max_push_components, max_chunk_size, - stage_prog_data); + max_push_components, + max_chunk_size, stage_prog_data); + assert(push_constant_loc[u] >= 0); } + /* As the uniforms are going to be reordered, take the data from a temporary * copy of the original param[]. */ @@ -2201,6 +2227,7 @@ fs_visitor::assign_constant_locations() * push_constant_loc[i] <= i and we can do it in one smooth loop without * having to make a copy. */ + int new_thread_local_id_index = -1; for (unsigned int i = 0; i < uniforms; i++) { const gl_constant_value *value = param[i]; @@ -2208,9 +2235,15 @@ fs_visitor::assign_constant_locations() stage_prog_data->pull_param[pull_constant_loc[i]] = value; } else if (push_constant_loc[i] != -1) { stage_prog_data->param[push_constant_loc[i]] = value; + if (thread_local_id_index == (int)i) + new_thread_local_id_index = push_constant_loc[i]; } } ralloc_free(param); + + if (stage == MESA_SHADER_COMPUTE) + ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index = + new_thread_local_id_index; } /** @@ -6185,6 +6218,14 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, shader->info.cs.local_size[0] * shader->info.cs.local_size[1] * shader->info.cs.local_size[2]; + prog_data->thread_local_id_index = -1; + nir_foreach_variable(var, &shader->uniforms) { + if (strcmp(var->name, "cs_thread_local_id") == 0) { + prog_data->thread_local_id_index = var->data.driver_location / 4; + break; + } + } + unsigned max_cs_threads = compiler->devinfo->max_cs_threads; unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads); -- 2.8.1 _______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
