Module: Mesa
Branch: main
Commit: 38e8a73e14d0af70b60c1884d30308e19ef9d60f
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=38e8a73e14d0af70b60c1884d30308e19ef9d60f

Author: Pierre-Eric Pelloux-Prayer <[email protected]>
Date:   Fri Apr 15 15:12:39 2022 +0200

radeonsi: implement GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB in shaders

Statistics only work in non-NGG mode. If screen->use_ngg is true, we can't
know if the draw will actually use NGG or not, so this commit switches
to a shader-based implementation of this counter.

To avoid modifying si_query, the shader implementation behaves like the hw
one: it uses the same buffer size and offset.

The emulation path in the shader is enabled by bit 31 of vs_state_bits.

Reviewed-by: Marek Olšák <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15861>

---

 .../drivers/radeonsi/ci/gfx10-navi10-fail.csv      |  2 -
 .../radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv    |  2 -
 .../drivers/radeonsi/ci/gfx9-raven-fail.csv        |  1 -
 src/gallium/drivers/radeonsi/gfx10_shader_ngg.c    | 30 ++++++++++
 src/gallium/drivers/radeonsi/si_pipe.h             |  3 +
 src/gallium/drivers/radeonsi/si_query.c            | 66 ++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_query.h            |  4 ++
 src/gallium/drivers/radeonsi/si_shader.h           |  2 +
 src/gallium/drivers/radeonsi/si_shader_internal.h  |  2 +
 src/gallium/drivers/radeonsi/si_shader_llvm.c      |  2 +
 src/gallium/drivers/radeonsi/si_shader_llvm_gs.c   | 52 +++++++++++++++++
 src/gallium/drivers/radeonsi/si_state.h            |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.cpp     | 27 +++++++--
 13 files changed, 173 insertions(+), 21 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/ci/gfx10-navi10-fail.csv 
b/src/gallium/drivers/radeonsi/ci/gfx10-navi10-fail.csv
index 8172d273f68..e6c33aac5d9 100644
--- a/src/gallium/drivers/radeonsi/ci/gfx10-navi10-fail.csv
+++ b/src/gallium/drivers/radeonsi/ci/gfx10-navi10-fail.csv
@@ -77,7 +77,6 @@ 
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-double-fl
 
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec2-vec2,Fail
 
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail
 
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail
-spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-geom,Fail
 
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail
 
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]'
 on GL_PROGRAM_INPUT,Fail
 spec@arb_query_buffer_object@coherency,Fail
@@ -172,7 +171,6 @@ wgl@wgl-sanity,Fail
 
 # glcts failures
 KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail
-KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail
 KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail
 KHR-GL46.shader_ballot_tests.ShaderBallotBitmasks,Fail
 KHR-GL46.sparse_texture_tests.SparseTextureCommitment,Fail
diff --git a/src/gallium/drivers/radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv 
b/src/gallium/drivers/radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv
index d57b7584574..c97dfd00627 100644
--- a/src/gallium/drivers/radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv
+++ b/src/gallium/drivers/radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv
@@ -80,7 +80,6 @@ 
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec2-vec
 
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail
 
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail
 spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-frag,Fail
-spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-geom,Fail
 
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail
 
spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]'
 on GL_PROGRAM_INPUT,Fail
 spec@arb_query_buffer_object@coherency,Fail
@@ -186,7 +185,6 @@ wgl@wgl-multi-window-single-context,Fail
 wgl@wgl-sanity,Fail
 
 # glcts failures
-KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail
 KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail
 KHR-GL46.sparse_texture2_tests.SparseTexture2Allocation,Fail
 KHR-GL46.sparse_texture2_tests.SparseTexture2Commitment,Fail
diff --git a/src/gallium/drivers/radeonsi/ci/gfx9-raven-fail.csv 
b/src/gallium/drivers/radeonsi/ci/gfx9-raven-fail.csv
index b804aca1765..7cd5f50959e 100644
--- a/src/gallium/drivers/radeonsi/ci/gfx9-raven-fail.csv
+++ b/src/gallium/drivers/radeonsi/ci/gfx9-raven-fail.csv
@@ -262,7 +262,6 @@ wgl@wgl-sanity,Fail
 
 # glcts failures
 KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail
-KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail
 KHR-GL46.packed_pixels.pbo_rectangle.r16_snorm,Fail
 KHR-GL46.packed_pixels.pbo_rectangle.r8_snorm,Fail
 KHR-GL46.packed_pixels.pbo_rectangle.rg16_snorm,Fail
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c 
b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index d17ad48d1b6..d4a96d71087 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -23,6 +23,7 @@
 
 #include "ac_llvm_cull.h"
 #include "si_pipe.h"
+#include "si_query.h"
 #include "si_shader_internal.h"
 #include "sid.h"
 #include "util/u_memory.h"
@@ -70,6 +71,14 @@ static LLVMValueRef ngg_get_query_buf(struct 
si_shader_context *ctx)
                                 LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_BUF, 
false));
 }
 
+static LLVMValueRef ngg_get_emulated_counters_buf(struct si_shader_context 
*ctx)
+{
+   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
+
+   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
+                                LLVMConstInt(ctx->ac.i32, 
SI_GS_QUERY_EMULATED_COUNTERS_BUF, false));
+}
+
 /**
  * Return the number of vertices as a constant in \p num_vertices,
  * and return a more precise value as LLVMValueRef from the function.
@@ -2129,6 +2138,27 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context 
*ctx)
       }
 
       ac_build_export_prim(&ctx->ac, &prim);
+
+      tmp = si_unpack_param(ctx, ctx->vs_state_bits, 31, 1);
+      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5229); /* if (GS_PIPELINE_STATS_EMU) */
+      ac_build_ifcc(&ctx->ac, LLVMBuildNot(builder, prim.isnull, ""), 5237);
+      {
+         LLVMValueRef args[] = {
+            ctx->ac.i32_1,
+            ngg_get_emulated_counters_buf(ctx),
+            LLVMConstInt(ctx->ac.i32,
+                         (si_hw_query_dw_offset(PIPE_STAT_QUERY_GS_PRIMITIVES) 
+
+                             SI_QUERY_STATS_END_OFFSET_DW) * 4,
+                         false),
+            ctx->ac.i32_0,                            /* soffset */
+            ctx->ac.i32_0,                            /* cachepolicy */
+         };
+
+         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", 
ctx->ac.i32, args, 5, 0);
+      }
+      ac_build_endif(&ctx->ac, 5237);
+      ac_build_endif(&ctx->ac, 5229);
    }
    ac_build_endif(&ctx->ac, 5140);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 9cf0417721e..27fe1fa2ccd 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1159,6 +1159,8 @@ struct si_context {
    unsigned last_gs_out_prim;
    unsigned current_vs_state;
    unsigned last_vs_state;
+   bool current_gs_stats_counter_emul;
+   bool last_gs_stats_counter_emul;
    enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */
 
    struct si_small_prim_cull_info last_small_prim_cull_info;
@@ -1263,6 +1265,7 @@ struct si_context {
    int num_occlusion_queries;
    int num_perfect_occlusion_queries;
    int num_pipeline_stat_queries;
+   int num_pipeline_stat_emulated_queries;
    struct list_head active_queries;
    unsigned num_cs_dw_queries_suspend;
 
diff --git a/src/gallium/drivers/radeonsi/si_query.c 
b/src/gallium/drivers/radeonsi/si_query.c
index d9cf2715fc3..efd81af36c8 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -730,6 +730,9 @@ static struct pipe_query *si_query_hw_create(struct 
si_screen *sscreen, unsigned
       query->result_size += 8; /* for the fence + alignment */
       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
       query->index = index;
+      if (index == PIPE_STAT_QUERY_GS_PRIMITIVES &&
+          sscreen->use_ngg && (sscreen->info.chip_class >= GFX10 && 
sscreen->info.chip_class <= GFX10_3))
+         query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
       break;
    default:
       assert(0);
@@ -836,12 +839,44 @@ static void si_query_hw_do_emit_start(struct si_context 
*sctx, struct si_query_h
                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS: {
-      radeon_begin(cs);
-      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
-      radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
-      radeon_emit(va);
-      radeon_emit(va >> 32);
-      radeon_end();
+      if (sctx->screen->use_ngg && query->flags & 
SI_QUERY_EMULATE_GS_COUNTERS) {
+         /* The hw GS primitive counter doesn't work when ngg is active.
+          * So if use_ngg is true, we don't use the hw version but instead
+          * emulate it in the GS shader.
+          * The value is written at the same position, so we don't need to
+          * change anything else.
+          * If ngg is enabled for the draw, the primitive count is written in
+          * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of 
exported
+          * vertices is stored in gs_emitted_vertices and the number of prim
+          * is computed based on the output prim type in emit_gs_epilogue.
+          */
+         struct pipe_shader_buffer sbuf;
+         sbuf.buffer = &buffer->b.b;
+         sbuf.buffer_offset = query->buffer.results_end;
+         sbuf.buffer_size = buffer->bo_size;
+         si_set_internal_shader_buffer(sctx, 
SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
+         sctx->current_gs_stats_counter_emul = true;
+
+         const uint32_t zero = 0;
+         radeon_begin(cs);
+         /* Clear the emulated counter end value. We don't clear start because 
it's unused. */
+         va += (si_hw_query_dw_offset(query->index) + 
SI_QUERY_STATS_END_OFFSET_DW) * 4;
+         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
+         radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | 
S_370_ENGINE_SEL(V_370_PFP));
+         radeon_emit(va);
+         radeon_emit(va >> 32);
+         radeon_emit(zero);
+         radeon_end();
+
+         sctx->num_pipeline_stat_emulated_queries++;
+      } else {
+         radeon_begin(cs);
+         radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
+         radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | 
EVENT_INDEX(2));
+         radeon_emit(va);
+         radeon_emit(va >> 32);
+         radeon_end();
+      }
       break;
    }
    default:
@@ -918,11 +953,22 @@ static void si_query_hw_do_emit_stop(struct si_context 
*sctx, struct si_query_hw
       unsigned sample_size = (query->result_size - 8) / 2;
 
       va += sample_size;
+
       radeon_begin(cs);
-      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
-      radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
-      radeon_emit(va);
-      radeon_emit(va >> 32);
+      if (sctx->screen->use_ngg && query->flags & 
SI_QUERY_EMULATE_GS_COUNTERS) {
+         radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+         if (--sctx->num_pipeline_stat_emulated_queries == 0) {
+            si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
+            sctx->current_gs_stats_counter_emul = false;
+         }
+      } else {
+         radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
+         radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | 
EVENT_INDEX(2));
+         radeon_emit(va);
+         radeon_emit(va >> 32);
+      }
       radeon_end();
 
       fence_va = va + sample_size;
diff --git a/src/gallium/drivers/radeonsi/si_query.h 
b/src/gallium/drivers/radeonsi/si_query.h
index b169da16fca..a7be5dae0d0 100644
--- a/src/gallium/drivers/radeonsi/si_query.h
+++ b/src/gallium/drivers/radeonsi/si_query.h
@@ -170,6 +170,10 @@ enum
    /* gap */
    /* whether begin_query doesn't clear the result */
    SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
+   /* whether GS invocations and emitted primitives counters are emulated
+    * using atomic adds.
+    */
+   SI_QUERY_EMULATE_GS_COUNTERS = (1 << 3),
 };
 
 struct si_query_hw_ops {
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index 77822c8e8ea..36d9fc2075d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -273,6 +273,8 @@ enum
 #define C_VS_STATE_LS_OUT_PATCH_SIZE          0xFF0007FF
 #define S_VS_STATE_LS_OUT_VERTEX_SIZE(x)      (((unsigned)(x)&0xFF) << 24)
 #define C_VS_STATE_LS_OUT_VERTEX_SIZE         0x00FFFFFF
+#define S_VS_STATE_GS_PIPELINE_STATS_EMU(x)   (((unsigned)(x)&0x1) << 31)
+#define C_VS_STATE_GS_PIPELINE_STATS_EMU      0x7FFFFFFF
 
 enum
 {
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h 
b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 9a7288f21d7..2595045bcee 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -152,6 +152,8 @@ struct si_shader_context {
    LLVMValueRef gs_ngg_emit;
    LLVMValueRef gs_ngg_scratch;
    LLVMValueRef return_value;
+
+   LLVMValueRef gs_emitted_vertices;
 };
 
 static inline struct si_shader_context *si_shader_context_from_abi(struct 
ac_shader_abi *abi)
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c 
b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index 7b99d68250a..f83b788eaaf 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -919,6 +919,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, 
struct si_shader *shad
             ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", 
AC_ADDR_SPACE_LDS);
          LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
          LLVMSetAlignment(ctx->gs_ngg_emit, 4);
+      } else {
+         ctx->gs_emitted_vertices = LLVMConstInt(ctx->ac.i32, 0, false);
       }
    }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c 
b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index 0292a7f2145..a607f94fb31 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -25,6 +25,7 @@
 #include "ac_nir.h"
 #include "si_pipe.h"
 #include "si_shader_internal.h"
+#include "si_query.h"
 #include "sid.h"
 #include "util/u_memory.h"
 
@@ -200,6 +201,14 @@ static LLVMValueRef si_get_gs_wave_id(struct 
si_shader_context *ctx)
       return ac_get_arg(&ctx->ac, ctx->args.gs_wave_id);
 }
 
+static LLVMValueRef ngg_get_emulated_counters_buf(struct si_shader_context 
*ctx)
+{
+   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
+
+   return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
+                                LLVMConstInt(ctx->ac.i32, 
SI_GS_QUERY_EMULATED_COUNTERS_BUF, false));
+}
+
 static void emit_gs_epilogue(struct si_shader_context *ctx)
 {
    if (ctx->shader->key.ge.as_ngg) {
@@ -210,6 +219,46 @@ static void emit_gs_epilogue(struct si_shader_context *ctx)
    if (ctx->screen->info.chip_class >= GFX10)
       LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
 
+   if (ctx->screen->use_ngg) {
+      /* Implement PIPE_STAT_QUERY_GS_PRIMITIVES for non-ngg draws because we 
can't
+       * use pipeline statistics (they would be correct but when 
screen->use_ngg, we
+       * can't know when the query is started if the next draw(s) will use ngg 
or not).
+       */
+      LLVMValueRef tmp = si_unpack_param(ctx, ctx->vs_state_bits, 31, 1);
+      tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
+      ac_build_ifcc(&ctx->ac, tmp, 5229); /* if (GS_PIPELINE_STATS_EMU) */
+      {
+         LLVMValueRef prim = ctx->ac.i32_0;
+         switch (ctx->shader->selector->info.base.gs.output_primitive) {
+         case SHADER_PRIM_POINTS:
+            prim = ctx->gs_emitted_vertices;
+            break;
+         case SHADER_PRIM_LINE_STRIP:
+            prim = LLVMBuildSub(ctx->ac.builder, ctx->gs_emitted_vertices, 
ctx->ac.i32_1, "");
+            prim = ac_build_imax(&ctx->ac, prim, ctx->ac.i32_0);
+            break;
+         case SHADER_PRIM_TRIANGLE_STRIP:
+            prim = LLVMBuildSub(ctx->ac.builder, ctx->gs_emitted_vertices, 
LLVMConstInt(ctx->ac.i32, 2, 0), "");
+            prim = ac_build_imax(&ctx->ac, prim, ctx->ac.i32_0);
+            break;
+         }
+
+         LLVMValueRef args[] = {
+            prim,
+            ngg_get_emulated_counters_buf(ctx),
+            LLVMConstInt(ctx->ac.i32,
+                         (si_hw_query_dw_offset(PIPE_STAT_QUERY_GS_PRIMITIVES) 
+
+                             SI_QUERY_STATS_END_OFFSET_DW) * 4,
+                         false),
+            ctx->ac.i32_0,                            /* soffset */
+            ctx->ac.i32_0,                            /* cachepolicy */
+         };
+
+         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", 
ctx->ac.i32, args, 5, 0);
+      }
+      ac_build_endif(&ctx->ac, 5229);
+   }
+
    ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, 
si_get_gs_wave_id(ctx));
 
    if (ctx->screen->info.chip_class >= GFX9)
@@ -295,6 +344,9 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, 
unsigned stream, LLVM
    if (offset) {
       ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | 
(stream << 8),
                        si_get_gs_wave_id(ctx));
+
+      ctx->gs_emitted_vertices = LLVMBuildAdd(ctx->ac.builder, 
ctx->gs_emitted_vertices,
+                                              ctx->ac.i32_1, "vert");
    }
 
    if (!use_kill)
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index 9046a84631c..b6e83a186c1 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -369,6 +369,7 @@ enum
 
    SI_RING_ESGS,                       /* gfx6-8 */
    SI_RING_GSVS,                       /* gfx6-10 */
+   SI_GS_QUERY_EMULATED_COUNTERS_BUF,  /* gfx10+ */
 
    SI_NUM_INTERNAL_BINDINGS,
 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp 
b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index c87dbc74961..8bf1bea07a7 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1229,15 +1229,31 @@ static void si_emit_vs_state(struct si_context *sctx, 
unsigned index_size)
       sctx->current_vs_state |= S_VS_STATE_INDEXED(!!index_size);
    }
 
-   if (sctx->current_vs_state != sctx->last_vs_state) {
+   bool gs_counters_emu = (GFX_VERSION >= GFX10 && GFX_VERSION <= GFX10_3) && 
HAS_GS;
+
+   if (sctx->current_vs_state != sctx->last_vs_state ||
+       (gs_counters_emu && sctx->current_gs_stats_counter_emul != 
sctx->last_gs_stats_counter_emul)) {
       struct radeon_cmdbuf *cs = &sctx->gfx_cs;
 
       /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */
       unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, 
NGG,
                                                PIPE_SHADER_VERTEX);
+
+      unsigned vs_state = sctx->current_vs_state;
+      unsigned gs_state = vs_state;
+      if (gs_counters_emu) {
+         /* Remove HS/LS state and add GS-specific state to control
+          * counters emulation.
+          */
+         gs_state = vs_state & C_VS_STATE_LS_OUT_PATCH_SIZE & 
C_VS_STATE_LS_OUT_VERTEX_SIZE;
+         gs_state |= 
S_VS_STATE_GS_PIPELINE_STATS_EMU(sctx->current_gs_stats_counter_emul);
+         sctx->last_gs_stats_counter_emul = 
sctx->current_gs_stats_counter_emul;
+      }
+
       radeon_begin(cs);
       radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4,
-                        sctx->current_vs_state);
+                        (gs_counters_emu && vs_base == 
R_00B230_SPI_SHADER_USER_DATA_GS_0) ?
+                           gs_state : vs_state);
 
       /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage
        * before the rasterizer.
@@ -1246,14 +1262,13 @@ static void si_emit_vs_state(struct si_context *sctx, 
unsigned index_size)
        */
       if (GFX_VERSION <= GFX10_3 && vs_base != 
R_00B130_SPI_SHADER_USER_DATA_VS_0) {
          radeon_set_sh_reg(R_00B130_SPI_SHADER_USER_DATA_VS_0 + 
SI_SGPR_VS_STATE_BITS * 4,
-                           sctx->current_vs_state);
+                           vs_state);
       }
 
       /* For NGG: */
-      if (GFX_VERSION >= GFX10 && vs_base != 
R_00B230_SPI_SHADER_USER_DATA_GS_0) {
+      if (GFX_VERSION >= GFX10 && vs_base != 
R_00B230_SPI_SHADER_USER_DATA_GS_0)
          radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + 
SI_SGPR_VS_STATE_BITS * 4,
-                           sctx->current_vs_state);
-      }
+                           gs_state);
       radeon_end();
 
       sctx->last_vs_state = sctx->current_vs_state;

Reply via email to