Module: Mesa
Branch: main
Commit: 6058b7381e56af31282682d0c661971b00b766ad
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6058b7381e56af31282682d0c661971b00b766ad

Author: Marek Olšák <[email protected]>
Date:   Sun Sep 26 10:17:28 2021 -0400

radeonsi: always use the correct number of vertices in NGG shader code

The number of vertices per primitive is always 3 now, but it will be 2 for lines.

Reviewed-by: Pierre-Eric Pelloux-Prayer <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13048>

---

 src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 37 +++++++++++++------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 69f18f9fe20..e5f54de2c78 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -86,8 +86,7 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct 
si_shader_context *ctx, uns
       } else {
          /* We always build up all three indices for the prim export
           * independent of the primitive type. The additional garbage
-          * data shouldn't hurt. This number doesn't matter with
-          * NGG passthrough.
+          * data shouldn't hurt. This is used by exports and streamout.
           */
          *num_vertices = 3;
 
@@ -186,7 +185,7 @@ void gfx10_ngg_build_export_prim(struct si_shader_context 
*ctx, LLVMValueRef use
       else
          prim.edgeflags = ctx->ac.i32_0;
 
-      for (unsigned i = 0; i < 3; ++i)
+      for (unsigned i = 0; i < prim.num_vertices; ++i)
          prim.index[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], 
(i & 1) * 16, 16);
 
       if (ctx->shader->selector->info.writes_edgeflag) {
@@ -793,9 +792,12 @@ static void gfx10_build_primitive_accepted(struct 
ac_llvm_context *ac, LLVMValue
    LLVMValueRef gs_accepted = params[0];
    LLVMValueRef *gs_vtxptr = (LLVMValueRef *)params[1];
 
+   unsigned num_vertices;
+   ngg_get_vertices_per_prim(ctx, &num_vertices);
+
    ac_build_ifcc(&ctx->ac, accepted, 0);
    LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_1, gs_accepted);
-   for (unsigned vtx = 0; vtx < 3; vtx++) {
+   for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
       LLVMBuildStore(ctx->ac.builder, ctx->ac.i8_1,
                      si_build_gep_i8(ctx, gs_vtxptr[vtx], 
lds_byte0_accept_flag));
    }
@@ -872,6 +874,9 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi 
*abi)
 
    LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
 
+   unsigned num_vertices;
+   ngg_get_vertices_per_prim(ctx, &num_vertices);
+
    /* The hardware requires that there are no holes between unculled vertices,
     * which means we have to pack ES threads, i.e. reduce the ES thread count
     * and move ES input VGPRs to lower threads. The upside is that varyings
@@ -905,17 +910,16 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi 
*abi)
       /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
        * into these VGPRs.
        */
-      for (unsigned i = 0; i < 3; ++i)
+      for (unsigned i = 0; i < num_vertices; ++i)
          vtxindex[i] = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[i]);
    } else {
-      for (unsigned i = 0; i < 3; ++i)
+      for (unsigned i = 0; i < num_vertices; ++i)
          vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i 
& 1) * 16, 16);
    };
-   LLVMValueRef gs_vtxptr[] = {
-      ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
-      ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
-      ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
-   };
+   LLVMValueRef gs_vtxptr[3];
+   for (unsigned i = 0; i < num_vertices; i++)
+      gs_vtxptr[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
+
    es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
 
    /* Adding these optimization barriers improves the generated code as 
follows. Crazy right?
@@ -943,9 +947,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi 
*abi)
     * - v_mul_u32_u24_e32 v17, 28, v11
     * - v_mul_u32_u24_e32 v18, 28, v10
     */
-   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[0], false);
-   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[1], false);
-   ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[2], false);
+   for (unsigned i = 0; i < num_vertices; i++)
+      ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[i], false);
 
    LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
 
@@ -954,7 +957,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi 
*abi)
    {
       /* Load positions. */
       LLVMValueRef pos[3][4] = {};
-      for (unsigned vtx = 0; vtx < 3; vtx++) {
+      for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
          for (unsigned chan = 0; chan < 4; chan++) {
             unsigned index;
             if (chan == 0 || chan == 1)
@@ -1155,7 +1158,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi 
*abi)
    ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, 
""), 16011);
    {
       struct ac_ngg_prim prim = {};
-      prim.num_vertices = 3;
+      prim.num_vertices = num_vertices;
       prim.isnull = ctx->ac.i1false;
 
       if (ctx->stage == MESA_SHADER_VERTEX)
@@ -1163,7 +1166,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi 
*abi)
       else
          prim.edgeflags = ctx->ac.i32_0;
 
-      for (unsigned vtx = 0; vtx < 3; vtx++) {
+      for (unsigned vtx = 0; vtx < num_vertices; vtx++) {
          prim.index[vtx] = LLVMBuildLoad(
             builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], 
lds_byte1_new_thread_id), "");
          prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], 
ctx->ac.i32, "");

Reply via email to