Module: Mesa
Branch: main
Commit: c721f751f2593267fdf7eb352621d4392e62205e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c721f751f2593267fdf7eb352621d4392e62205e
Author: Timur Kristóf <[email protected]> Date: Wed Aug 3 11:53:29 2022 +0200 ac/nir/ngg: Move LDS store of accepted flag into the inner branch. For primitives which are rejected based on only W and face, this will reduce the number of executed branches. Fossil DB stats on Navi 21: Totals from 60918 (45.16% of 134906) affected shaders: CodeSize: 160330564 -> 160086644 (-0.15%) Instrs: 30477385 -> 30477916 (+0.00%); split: -0.00%, +0.00% Latency: 139802763 -> 139587915 (-0.15%); split: -0.15%, +0.00% InvThroughput: 21198444 -> 21184261 (-0.07%); split: -0.07%, +0.00% SClause: 749811 -> 749810 (-0.00%) Copies: 2701482 -> 2762930 (+2.27%); split: -0.00%, +2.28% Signed-off-by: Timur Kristóf <[email protected]> Reviewed-by: Marek Olšák <[email protected]> Reviewed-by: Qiang Yu <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17870> --- src/amd/common/ac_nir.h | 9 +++++++-- src/amd/common/ac_nir_cull.c | 18 ++++++++++-------- src/amd/common/ac_nir_lower_ngg.c | 34 +++++++++++++++++++--------------- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 9a9b39d0ddc..bee4ca8410a 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -55,6 +55,9 @@ typedef unsigned (*ac_nir_map_io_driver_location)(unsigned semantic); struct nir_builder; typedef struct nir_builder nir_builder; +/* Executed by ac_nir_cull when the current primitive is accepted. 
*/ +typedef void (*ac_nir_cull_accepted)(nir_builder *b, void *state); + nir_ssa_def * ac_nir_load_arg(nir_builder *b, const struct ac_shader_args *ac_args, struct ac_arg arg); @@ -158,10 +161,12 @@ ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader, unsigned task_payload_entry_bytes, unsigned task_num_entries); -nir_ssa_def * +void ac_nir_cull_triangle(nir_builder *b, nir_ssa_def *initially_accepted, - nir_ssa_def *pos[3][4]); + nir_ssa_def *pos[3][4], + ac_nir_cull_accepted accept_func, + void *state); bool ac_nir_lower_global_access(nir_shader *shader); diff --git a/src/amd/common/ac_nir_cull.c b/src/amd/common/ac_nir_cull.c index e9ed1128662..212d65a422d 100644 --- a/src/amd/common/ac_nir_cull.c +++ b/src/amd/common/ac_nir_cull.c @@ -142,10 +142,12 @@ cull_small_primitive(nir_builder *b, nir_ssa_def *bbox_min[3], nir_ssa_def *bbox return nir_if_phi(b, prim_is_small, prim_is_small_else); } -nir_ssa_def * +void ac_nir_cull_triangle(nir_builder *b, nir_ssa_def *initially_accepted, - nir_ssa_def *pos[3][4]) + nir_ssa_def *pos[3][4], + ac_nir_cull_accepted accept_func, + void *state) { position_w_info w_info = {0}; analyze_position_w(b, pos, &w_info); @@ -154,8 +156,6 @@ ac_nir_cull_triangle(nir_builder *b, accepted = nir_iand(b, accepted, w_info.w_accepted); accepted = nir_iand(b, accepted, cull_face(b, pos, &w_info)); - nir_ssa_def *bbox_accepted = NULL; - nir_if *if_accepted = nir_push_if(b, accepted); { nir_ssa_def *bbox_min[3] = {0}, *bbox_max[3] = {0}; @@ -165,10 +165,12 @@ ac_nir_cull_triangle(nir_builder *b, nir_ssa_def *prim_is_small = cull_small_primitive(b, bbox_min, bbox_max); nir_ssa_def *prim_invisible = nir_ior(b, prim_outside_view, prim_is_small); - bbox_accepted = nir_iand(b, nir_inot(b, prim_invisible), w_info.all_w_positive); + accepted = nir_iand(b, nir_inot(b, prim_invisible), w_info.all_w_positive); + nir_if *if_still_accepted = nir_push_if(b, accepted); + { + accept_func(b, state); + } + nir_pop_if(b, if_still_accepted); } nir_pop_if(b, 
if_accepted); - accepted = nir_iand(b, accepted, nir_if_phi(b, bbox_accepted, accepted)); - - return accepted; } diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index addf2e010d7..193a1820530 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -47,6 +47,8 @@ typedef struct nir_variable *gs_accepted_var; nir_variable *gs_vtx_indices_vars[3]; + nir_ssa_def *vtx_addr[3]; + struct u_vector saved_uniforms; bool passthrough; @@ -1113,6 +1115,18 @@ apply_reusable_variables(nir_builder *b, lower_ngg_nogs_state *nogs_state) u_vector_finish(&nogs_state->saved_uniforms); } +static void +cull_primitive_accepted(nir_builder *b, void *state) +{ + lower_ngg_nogs_state *s = (lower_ngg_nogs_state *)state; + + nir_store_var(b, s->gs_accepted_var, nir_imm_true(b), 0x1u); + + /* Store the accepted state to LDS for ES threads */ + for (unsigned vtx = 0; vtx < 3; ++vtx) + nir_store_shared(b, nir_imm_intN_t(b, 1, 8), s->vtx_addr[vtx], .base = lds_es_vertex_accepted); +} + static void add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_cf, lower_ngg_nogs_state *nogs_state) { @@ -1242,34 +1256,24 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c for (unsigned vertex = 0; vertex < 3; ++vertex) vtx_idx[vertex] = nir_load_var(b, nogs_state->gs_vtx_indices_vars[vertex]); - nir_ssa_def *vtx_addr[3] = {0}; nir_ssa_def *pos[3][4] = {0}; /* Load W positions of vertices first because the culling code will use these first */ for (unsigned vtx = 0; vtx < 3; ++vtx) { - vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes); - pos[vtx][3] = nir_load_shared(b, 1, 32, vtx_addr[vtx], .base = lds_es_pos_w); - nir_store_var(b, gs_vtxaddr_vars[vtx], vtx_addr[vtx], 0x1u); + nogs_state->vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes); + pos[vtx][3] = nir_load_shared(b, 1, 32, nogs_state->vtx_addr[vtx], .base = lds_es_pos_w); + 
nir_store_var(b, gs_vtxaddr_vars[vtx], nogs_state->vtx_addr[vtx], 0x1u); } /* Load the X/W, Y/W positions of vertices */ for (unsigned vtx = 0; vtx < 3; ++vtx) { - nir_ssa_def *xy = nir_load_shared(b, 2, 32, vtx_addr[vtx], .base = lds_es_pos_x); + nir_ssa_def *xy = nir_load_shared(b, 2, 32, nogs_state->vtx_addr[vtx], .base = lds_es_pos_x); pos[vtx][0] = nir_channel(b, xy, 0); pos[vtx][1] = nir_channel(b, xy, 1); } /* See if the current primitive is accepted */ - nir_ssa_def *accepted = ac_nir_cull_triangle(b, nir_imm_bool(b, true), pos); - nir_store_var(b, gs_accepted_var, accepted, 0x1u); - - nir_if *if_gs_accepted = nir_push_if(b, accepted); - { - /* Store the accepted state to LDS for ES threads */ - for (unsigned vtx = 0; vtx < 3; ++vtx) - nir_store_shared(b, nir_imm_intN_t(b, 0xff, 8), vtx_addr[vtx], .base = lds_es_vertex_accepted, .align_mul = 4u); - } - nir_pop_if(b, if_gs_accepted); + ac_nir_cull_triangle(b, nir_imm_bool(b, true), pos, cull_primitive_accepted, nogs_state); } nir_pop_if(b, if_gs_thread);
