Module: Mesa
Branch: main
Commit: c721f751f2593267fdf7eb352621d4392e62205e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c721f751f2593267fdf7eb352621d4392e62205e

Author: Timur Kristóf <[email protected]>
Date:   Wed Aug  3 11:53:29 2022 +0200

ac/nir/ngg: Move LDS store of accepted flag into the inner branch.

For primitives that are rejected based solely on the W and face culling
checks, this reduces the number of branches executed.

Fossil DB stats on Navi 21:

Totals from 60918 (45.16% of 134906) affected shaders:
CodeSize: 160330564 -> 160086644 (-0.15%)
Instrs: 30477385 -> 30477916 (+0.00%); split: -0.00%, +0.00%
Latency: 139802763 -> 139587915 (-0.15%); split: -0.15%, +0.00%
InvThroughput: 21198444 -> 21184261 (-0.07%); split: -0.07%, +0.00%
SClause: 749811 -> 749810 (-0.00%)
Copies: 2701482 -> 2762930 (+2.27%); split: -0.00%, +2.28%

Signed-off-by: Timur Kristóf <[email protected]>
Reviewed-by: Marek Olšák <[email protected]>
Reviewed-by: Qiang Yu <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17870>

---

 src/amd/common/ac_nir.h           |  9 +++++++--
 src/amd/common/ac_nir_cull.c      | 18 ++++++++++--------
 src/amd/common/ac_nir_lower_ngg.c | 34 +++++++++++++++++++---------------
 3 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index 9a9b39d0ddc..bee4ca8410a 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -55,6 +55,9 @@ typedef unsigned (*ac_nir_map_io_driver_location)(unsigned 
semantic);
 struct nir_builder;
 typedef struct nir_builder nir_builder;
 
+/* Executed by ac_nir_cull when the current primitive is accepted. */
+typedef void (*ac_nir_cull_accepted)(nir_builder *b, void *state);
+
 nir_ssa_def *
 ac_nir_load_arg(nir_builder *b, const struct ac_shader_args *ac_args, struct 
ac_arg arg);
 
@@ -158,10 +161,12 @@ ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader,
                                 unsigned task_payload_entry_bytes,
                                 unsigned task_num_entries);
 
-nir_ssa_def *
+void
 ac_nir_cull_triangle(nir_builder *b,
                      nir_ssa_def *initially_accepted,
-                     nir_ssa_def *pos[3][4]);
+                     nir_ssa_def *pos[3][4],
+                     ac_nir_cull_accepted accept_func,
+                     void *state);
 
 bool
 ac_nir_lower_global_access(nir_shader *shader);
diff --git a/src/amd/common/ac_nir_cull.c b/src/amd/common/ac_nir_cull.c
index e9ed1128662..212d65a422d 100644
--- a/src/amd/common/ac_nir_cull.c
+++ b/src/amd/common/ac_nir_cull.c
@@ -142,10 +142,12 @@ cull_small_primitive(nir_builder *b, nir_ssa_def 
*bbox_min[3], nir_ssa_def *bbox
    return nir_if_phi(b, prim_is_small, prim_is_small_else);
 }
 
-nir_ssa_def *
+void
 ac_nir_cull_triangle(nir_builder *b,
                      nir_ssa_def *initially_accepted,
-                     nir_ssa_def *pos[3][4])
+                     nir_ssa_def *pos[3][4],
+                     ac_nir_cull_accepted accept_func,
+                     void *state)
 {
    position_w_info w_info = {0};
    analyze_position_w(b, pos, &w_info);
@@ -154,8 +156,6 @@ ac_nir_cull_triangle(nir_builder *b,
    accepted = nir_iand(b, accepted, w_info.w_accepted);
    accepted = nir_iand(b, accepted, cull_face(b, pos, &w_info));
 
-   nir_ssa_def *bbox_accepted = NULL;
-
    nir_if *if_accepted = nir_push_if(b, accepted);
    {
       nir_ssa_def *bbox_min[3] = {0}, *bbox_max[3] = {0};
@@ -165,10 +165,12 @@ ac_nir_cull_triangle(nir_builder *b,
       nir_ssa_def *prim_is_small = cull_small_primitive(b, bbox_min, bbox_max);
       nir_ssa_def *prim_invisible = nir_ior(b, prim_outside_view, 
prim_is_small);
 
-      bbox_accepted = nir_iand(b, nir_inot(b, prim_invisible), 
w_info.all_w_positive);
+      accepted = nir_iand(b, nir_inot(b, prim_invisible), 
w_info.all_w_positive);
+      nir_if *if_still_accepted = nir_push_if(b, accepted);
+      {
+         accept_func(b, state);
+      }
+      nir_pop_if(b, if_still_accepted);
    }
    nir_pop_if(b, if_accepted);
-   accepted = nir_iand(b, accepted, nir_if_phi(b, bbox_accepted, accepted));
-
-   return accepted;
 }
diff --git a/src/amd/common/ac_nir_lower_ngg.c 
b/src/amd/common/ac_nir_lower_ngg.c
index addf2e010d7..193a1820530 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -47,6 +47,8 @@ typedef struct
    nir_variable *gs_accepted_var;
    nir_variable *gs_vtx_indices_vars[3];
 
+   nir_ssa_def *vtx_addr[3];
+
    struct u_vector saved_uniforms;
 
    bool passthrough;
@@ -1113,6 +1115,18 @@ apply_reusable_variables(nir_builder *b, 
lower_ngg_nogs_state *nogs_state)
    u_vector_finish(&nogs_state->saved_uniforms);
 }
 
+static void
+cull_primitive_accepted(nir_builder *b, void *state)
+{
+   lower_ngg_nogs_state *s = (lower_ngg_nogs_state *)state;
+
+   nir_store_var(b, s->gs_accepted_var, nir_imm_true(b), 0x1u);
+
+   /* Store the accepted state to LDS for ES threads */
+   for (unsigned vtx = 0; vtx < 3; ++vtx)
+      nir_store_shared(b, nir_imm_intN_t(b, 1, 8), s->vtx_addr[vtx], .base = 
lds_es_vertex_accepted);
+}
+
 static void
 add_deferred_attribute_culling(nir_builder *b, nir_cf_list 
*original_extracted_cf, lower_ngg_nogs_state *nogs_state)
 {
@@ -1242,34 +1256,24 @@ add_deferred_attribute_culling(nir_builder *b, 
nir_cf_list *original_extracted_c
          for (unsigned vertex = 0; vertex < 3; ++vertex)
             vtx_idx[vertex] = nir_load_var(b, 
nogs_state->gs_vtx_indices_vars[vertex]);
 
-         nir_ssa_def *vtx_addr[3] = {0};
          nir_ssa_def *pos[3][4] = {0};
 
          /* Load W positions of vertices first because the culling code will 
use these first */
          for (unsigned vtx = 0; vtx < 3; ++vtx) {
-            vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], 
pervertex_lds_bytes);
-            pos[vtx][3] = nir_load_shared(b, 1, 32, vtx_addr[vtx], .base = 
lds_es_pos_w);
-            nir_store_var(b, gs_vtxaddr_vars[vtx], vtx_addr[vtx], 0x1u);
+            nogs_state->vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], 
pervertex_lds_bytes);
+            pos[vtx][3] = nir_load_shared(b, 1, 32, nogs_state->vtx_addr[vtx], 
.base = lds_es_pos_w);
+            nir_store_var(b, gs_vtxaddr_vars[vtx], nogs_state->vtx_addr[vtx], 
0x1u);
          }
 
          /* Load the X/W, Y/W positions of vertices */
          for (unsigned vtx = 0; vtx < 3; ++vtx) {
-            nir_ssa_def *xy = nir_load_shared(b, 2, 32, vtx_addr[vtx], .base = 
lds_es_pos_x);
+            nir_ssa_def *xy = nir_load_shared(b, 2, 32, 
nogs_state->vtx_addr[vtx], .base = lds_es_pos_x);
             pos[vtx][0] = nir_channel(b, xy, 0);
             pos[vtx][1] = nir_channel(b, xy, 1);
          }
 
          /* See if the current primitive is accepted */
-         nir_ssa_def *accepted = ac_nir_cull_triangle(b, nir_imm_bool(b, 
true), pos);
-         nir_store_var(b, gs_accepted_var, accepted, 0x1u);
-
-         nir_if *if_gs_accepted = nir_push_if(b, accepted);
-         {
-            /* Store the accepted state to LDS for ES threads */
-            for (unsigned vtx = 0; vtx < 3; ++vtx)
-               nir_store_shared(b, nir_imm_intN_t(b, 0xff, 8), vtx_addr[vtx], 
.base = lds_es_vertex_accepted, .align_mul = 4u);
-         }
-         nir_pop_if(b, if_gs_accepted);
+         ac_nir_cull_triangle(b, nir_imm_bool(b, true), pos, 
cull_primitive_accepted, nogs_state);
       }
       nir_pop_if(b, if_gs_thread);
 

Reply via email to