Module: Mesa Branch: main Commit: 38e8a73e14d0af70b60c1884d30308e19ef9d60f URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=38e8a73e14d0af70b60c1884d30308e19ef9d60f
Author: Pierre-Eric Pelloux-Prayer <[email protected]> Date: Fri Apr 15 15:12:39 2022 +0200 radeonsi: implement GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB in shaders Statistics only work in non-NGG mode. If screen->use_ngg is true, we can't know if the draw will actually use NGG or not, so this commit switches to a shader-based implementation of this counter. To avoid modifying si_query, the shader implementation behaves like the hw one: it uses the same buffer size and offset. The emulation path activation in the shader is controlled by bit 31 of vs_state_bits. Reviewed-by: Marek Olšák <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15861> --- .../drivers/radeonsi/ci/gfx10-navi10-fail.csv | 2 - .../radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv | 2 - .../drivers/radeonsi/ci/gfx9-raven-fail.csv | 1 - src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 30 ++++++++++ src/gallium/drivers/radeonsi/si_pipe.h | 3 + src/gallium/drivers/radeonsi/si_query.c | 66 ++++++++++++++++++---- src/gallium/drivers/radeonsi/si_query.h | 4 ++ src/gallium/drivers/radeonsi/si_shader.h | 2 + src/gallium/drivers/radeonsi/si_shader_internal.h | 2 + src/gallium/drivers/radeonsi/si_shader_llvm.c | 2 + src/gallium/drivers/radeonsi/si_shader_llvm_gs.c | 52 +++++++++++++++++ src/gallium/drivers/radeonsi/si_state.h | 1 + src/gallium/drivers/radeonsi/si_state_draw.cpp | 27 +++++++-- 13 files changed, 173 insertions(+), 21 deletions(-) diff --git a/src/gallium/drivers/radeonsi/ci/gfx10-navi10-fail.csv b/src/gallium/drivers/radeonsi/ci/gfx10-navi10-fail.csv index 8172d273f68..e6c33aac5d9 100644 --- a/src/gallium/drivers/radeonsi/ci/gfx10-navi10-fail.csv +++ b/src/gallium/drivers/radeonsi/ci/gfx10-navi10-fail.csv @@ -77,7 +77,6 @@ spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-double-fl spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec2-vec2,Fail 
spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail -spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-geom,Fail spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail spec@arb_query_buffer_object@coherency,Fail @@ -172,7 +171,6 @@ wgl@wgl-sanity,Fail # glcts failures KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail -KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail KHR-GL46.shader_ballot_tests.ShaderBallotBitmasks,Fail KHR-GL46.sparse_texture_tests.SparseTextureCommitment,Fail diff --git a/src/gallium/drivers/radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv b/src/gallium/drivers/radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv index d57b7584574..c97dfd00627 100644 --- a/src/gallium/drivers/radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv +++ b/src/gallium/drivers/radeonsi/ci/gfx10_3-sienna_cichlid-fail.csv @@ -80,7 +80,6 @@ spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec2-vec spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-frag,Fail -spec@arb_pipeline_statistics_query@arb_pipeline_statistics_query-geom,Fail spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail spec@arb_query_buffer_object@coherency,Fail @@ -186,7 +185,6 @@ wgl@wgl-multi-window-single-context,Fail wgl@wgl-sanity,Fail # glcts failures 
-KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail KHR-GL46.sparse_texture2_tests.SparseTexture2Allocation,Fail KHR-GL46.sparse_texture2_tests.SparseTexture2Commitment,Fail diff --git a/src/gallium/drivers/radeonsi/ci/gfx9-raven-fail.csv b/src/gallium/drivers/radeonsi/ci/gfx9-raven-fail.csv index b804aca1765..7cd5f50959e 100644 --- a/src/gallium/drivers/radeonsi/ci/gfx9-raven-fail.csv +++ b/src/gallium/drivers/radeonsi/ci/gfx9-raven-fail.csv @@ -262,7 +262,6 @@ wgl@wgl-sanity,Fail # glcts failures KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail -KHR-GL46.pipeline_statistics_query_tests_ARB.functional_geometry_shader_queries,Fail KHR-GL46.packed_pixels.pbo_rectangle.r16_snorm,Fail KHR-GL46.packed_pixels.pbo_rectangle.r8_snorm,Fail KHR-GL46.packed_pixels.pbo_rectangle.rg16_snorm,Fail diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index d17ad48d1b6..d4a96d71087 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -23,6 +23,7 @@ #include "ac_llvm_cull.h" #include "si_pipe.h" +#include "si_query.h" #include "si_shader_internal.h" #include "sid.h" #include "util/u_memory.h" @@ -70,6 +71,14 @@ static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx) LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_BUF, false)); } +static LLVMValueRef ngg_get_emulated_counters_buf(struct si_shader_context *ctx) +{ + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings); + + return ac_build_load_to_sgpr(&ctx->ac, buf_ptr, + LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_EMULATED_COUNTERS_BUF, false)); +} + /** * Return the number of vertices as a constant in \p num_vertices, * and return a more precise value as LLVMValueRef from the function. 
@@ -2129,6 +2138,27 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) } ac_build_export_prim(&ctx->ac, &prim); + + tmp = si_unpack_param(ctx, ctx->vs_state_bits, 31, 1); + tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5229); /* if (GS_PIPELINE_STATS_EMU) */ + ac_build_ifcc(&ctx->ac, LLVMBuildNot(builder, prim.isnull, ""), 5237); + { + LLVMValueRef args[] = { + ctx->ac.i32_1, + ngg_get_emulated_counters_buf(ctx), + LLVMConstInt(ctx->ac.i32, + (si_hw_query_dw_offset(PIPE_STAT_QUERY_GS_PRIMITIVES) + + SI_QUERY_STATS_END_OFFSET_DW) * 4, + false), + ctx->ac.i32_0, /* soffset */ + ctx->ac.i32_0, /* cachepolicy */ + }; + + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, 0); + } + ac_build_endif(&ctx->ac, 5237); + ac_build_endif(&ctx->ac, 5229); } ac_build_endif(&ctx->ac, 5140); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 9cf0417721e..27fe1fa2ccd 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1159,6 +1159,8 @@ struct si_context { unsigned last_gs_out_prim; unsigned current_vs_state; unsigned last_vs_state; + bool current_gs_stats_counter_emul; + bool last_gs_stats_counter_emul; enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */ struct si_small_prim_cull_info last_small_prim_cull_info; @@ -1263,6 +1265,7 @@ struct si_context { int num_occlusion_queries; int num_perfect_occlusion_queries; int num_pipeline_stat_queries; + int num_pipeline_stat_emulated_queries; struct list_head active_queries; unsigned num_cs_dw_queries_suspend; diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index d9cf2715fc3..efd81af36c8 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -730,6 +730,9 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned 
query->result_size += 8; /* for the fence + alignment */ query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen); query->index = index; + if (index == PIPE_STAT_QUERY_GS_PRIMITIVES && + sscreen->use_ngg && (sscreen->info.chip_class >= GFX10 && sscreen->info.chip_class <= GFX10_3)) + query->flags |= SI_QUERY_EMULATE_GS_COUNTERS; break; default: assert(0); @@ -836,12 +839,44 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type); break; case PIPE_QUERY_PIPELINE_STATISTICS: { - radeon_begin(cs); - radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); - radeon_emit(va); - radeon_emit(va >> 32); - radeon_end(); + if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) { + /* The hw GS primitive counter doesn't work when ngg is active. + * So if use_ngg is true, we don't use the hw version but instead + * emulate it in the GS shader. + * The value is written at the same position, so we don't need to + * change anything else. + * If ngg is enabled for the draw, the primitive count is written in + * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported + * vertices is stored in gs_emitted_vertices and the number of prim + * is computed based on the output prim type in emit_gs_epilogue. + */ + struct pipe_shader_buffer sbuf; + sbuf.buffer = &buffer->b.b; + sbuf.buffer_offset = query->buffer.results_end; + sbuf.buffer_size = buffer->bo_size; + si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf); + sctx->current_gs_stats_counter_emul = true; + + const uint32_t zero = 0; + radeon_begin(cs); + /* Clear the emulated counter end value. We don't clear start because it's unused. 
*/ + va += (si_hw_query_dw_offset(query->index) + SI_QUERY_STATS_END_OFFSET_DW) * 4; + radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0)); + radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP)); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit(zero); + radeon_end(); + + sctx->num_pipeline_stat_emulated_queries++; + } else { + radeon_begin(cs); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_end(); + } break; } default: @@ -918,11 +953,22 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw unsigned sample_size = (query->result_size - 8) / 2; va += sample_size; + radeon_begin(cs); - radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); - radeon_emit(va); - radeon_emit(va >> 32); + if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + + if (--sctx->num_pipeline_stat_emulated_queries == 0) { + si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL); + sctx->current_gs_stats_counter_emul = false; + } + } else { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(va); + radeon_emit(va >> 32); + } radeon_end(); fence_va = va + sample_size; diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index b169da16fca..a7be5dae0d0 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -170,6 +170,10 @@ enum /* gap */ /* whether begin_query doesn't clear the result */ SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2), + /* whether GS invocations and emitted primitives counters are emulated + * using atomic adds. 
+ */ + SI_QUERY_EMULATE_GS_COUNTERS = (1 << 3), }; struct si_query_hw_ops { diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 77822c8e8ea..36d9fc2075d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -273,6 +273,8 @@ enum #define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFF0007FF #define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x)&0xFF) << 24) #define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF +#define S_VS_STATE_GS_PIPELINE_STATS_EMU(x) (((unsigned)(x)&0x1) << 31) +#define C_VS_STATE_GS_PIPELINE_STATS_EMU 0x7FFFFFFF enum { diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 9a7288f21d7..2595045bcee 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -152,6 +152,8 @@ struct si_shader_context { LLVMValueRef gs_ngg_emit; LLVMValueRef gs_ngg_scratch; LLVMValueRef return_value; + + LLVMValueRef gs_emitted_vertices; }; static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi) diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 7b99d68250a..f83b788eaaf 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -919,6 +919,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); LLVMSetAlignment(ctx->gs_ngg_emit, 4); + } else { + ctx->gs_emitted_vertices = LLVMConstInt(ctx->ac.i32, 0, false); } } diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 0292a7f2145..a607f94fb31 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ 
b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -25,6 +25,7 @@ #include "ac_nir.h" #include "si_pipe.h" #include "si_shader_internal.h" +#include "si_query.h" #include "sid.h" #include "util/u_memory.h" @@ -200,6 +201,14 @@ static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) return ac_get_arg(&ctx->ac, ctx->args.gs_wave_id); } +static LLVMValueRef ngg_get_emulated_counters_buf(struct si_shader_context *ctx) +{ + LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings); + + return ac_build_load_to_sgpr(&ctx->ac, buf_ptr, + LLVMConstInt(ctx->ac.i32, SI_GS_QUERY_EMULATED_COUNTERS_BUF, false)); +} + static void emit_gs_epilogue(struct si_shader_context *ctx) { if (ctx->shader->key.ge.as_ngg) { @@ -210,6 +219,46 @@ static void emit_gs_epilogue(struct si_shader_context *ctx) if (ctx->screen->info.chip_class >= GFX10) LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, ""); + if (ctx->screen->use_ngg) { + /* Implement PIPE_STAT_QUERY_GS_PRIMITIVES for non-ngg draws because we can't + * use pipeline statistics (they would be correct but when screen->use_ngg, we + * can't know when the query is started if the next draw(s) will use ngg or not). 
+ */ + LLVMValueRef tmp = si_unpack_param(ctx, ctx->vs_state_bits, 31, 1); + tmp = LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); + ac_build_ifcc(&ctx->ac, tmp, 5229); /* if (GS_PIPELINE_STATS_EMU) */ + { + LLVMValueRef prim = ctx->ac.i32_0; + switch (ctx->shader->selector->info.base.gs.output_primitive) { + case SHADER_PRIM_POINTS: + prim = ctx->gs_emitted_vertices; + break; + case SHADER_PRIM_LINE_STRIP: + prim = LLVMBuildSub(ctx->ac.builder, ctx->gs_emitted_vertices, ctx->ac.i32_1, ""); + prim = ac_build_imax(&ctx->ac, prim, ctx->ac.i32_0); + break; + case SHADER_PRIM_TRIANGLE_STRIP: + prim = LLVMBuildSub(ctx->ac.builder, ctx->gs_emitted_vertices, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + prim = ac_build_imax(&ctx->ac, prim, ctx->ac.i32_0); + break; + } + + LLVMValueRef args[] = { + prim, + ngg_get_emulated_counters_buf(ctx), + LLVMConstInt(ctx->ac.i32, + (si_hw_query_dw_offset(PIPE_STAT_QUERY_GS_PRIMITIVES) + + SI_QUERY_STATS_END_OFFSET_DW) * 4, + false), + ctx->ac.i32_0, /* soffset */ + ctx->ac.i32_0, /* cachepolicy */ + }; + + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5, 0); + } + ac_build_endif(&ctx->ac, 5229); + } + ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx)); if (ctx->screen->info.chip_class >= GFX9) @@ -295,6 +344,9 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVM if (offset) { ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8), si_get_gs_wave_id(ctx)); + + ctx->gs_emitted_vertices = LLVMBuildAdd(ctx->ac.builder, ctx->gs_emitted_vertices, + ctx->ac.i32_1, "vert"); } if (!use_kill) diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 9046a84631c..b6e83a186c1 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -369,6 +369,7 @@ enum SI_RING_ESGS, /* gfx6-8 */ SI_RING_GSVS, /* gfx6-10 */ + 
SI_GS_QUERY_EMULATED_COUNTERS_BUF, /* gfx10+ */ SI_NUM_INTERNAL_BINDINGS, diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index c87dbc74961..8bf1bea07a7 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1229,15 +1229,31 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) sctx->current_vs_state |= S_VS_STATE_INDEXED(!!index_size); } - if (sctx->current_vs_state != sctx->last_vs_state) { + bool gs_counters_emu = (GFX_VERSION >= GFX10 && GFX_VERSION <= GFX10_3) && HAS_GS; + + if (sctx->current_vs_state != sctx->last_vs_state || + (gs_counters_emu && sctx->current_gs_stats_counter_emul != sctx->last_gs_stats_counter_emul)) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */ unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, PIPE_SHADER_VERTEX); + + unsigned vs_state = sctx->current_vs_state; + unsigned gs_state = vs_state; + if (gs_counters_emu) { + /* Remove HS/LS state and add GS-specific state to control + * counters emulation. + */ + gs_state = vs_state & C_VS_STATE_LS_OUT_PATCH_SIZE & C_VS_STATE_LS_OUT_VERTEX_SIZE; + gs_state |= S_VS_STATE_GS_PIPELINE_STATS_EMU(sctx->current_gs_stats_counter_emul); + sctx->last_gs_stats_counter_emul = sctx->current_gs_stats_counter_emul; + } + radeon_begin(cs); radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); + (gs_counters_emu && vs_base == R_00B230_SPI_SHADER_USER_DATA_GS_0) ? + gs_state : vs_state); /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage * before the rasterizer. 
@@ -1246,14 +1262,13 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) */ if (GFX_VERSION <= GFX10_3 && vs_base != R_00B130_SPI_SHADER_USER_DATA_VS_0) { radeon_set_sh_reg(R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); + vs_state); } /* For NGG: */ - if (GFX_VERSION >= GFX10 && vs_base != R_00B230_SPI_SHADER_USER_DATA_GS_0) { + if (GFX_VERSION >= GFX10 && vs_base != R_00B230_SPI_SHADER_USER_DATA_GS_0) radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4, - sctx->current_vs_state); - } + gs_state); radeon_end(); sctx->last_vs_state = sctx->current_vs_state;
