Module: Mesa
Branch: main
Commit: 6959493f8c2a0542d13312069659c3c3e233206e
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=6959493f8c2a0542d13312069659c3c3e233206e
Author: Marek Olšák <[email protected]> Date: Thu Jun 8 00:12:39 2023 -0400 radeonsi: move the only tcs_out_lds_offsets field to vs_state_bits This removes 1 user data SGPR. Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23517> --- src/gallium/drivers/radeonsi/si_pipe.h | 1 - src/gallium/drivers/radeonsi/si_shader.c | 2 -- src/gallium/drivers/radeonsi/si_shader.h | 14 ++++++++++---- src/gallium/drivers/radeonsi/si_shader_internal.h | 9 --------- src/gallium/drivers/radeonsi/si_shader_llvm_tess.c | 5 +---- src/gallium/drivers/radeonsi/si_state_draw.cpp | 12 ++++++------ 6 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 20860ec4c90..74eb42da4bc 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1183,7 +1183,6 @@ struct si_context { unsigned last_tes_sh_base; bool last_tess_uses_primid; unsigned num_patches_per_workgroup; - unsigned tcs_out_offsets; unsigned tcs_offchip_layout; unsigned tes_offchip_ring_va_sgpr; unsigned ls_hs_rsrc2; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 2b7b00de1c0..e6d234ede6c 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -412,7 +412,6 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) declare_global_desc_pointers(args); declare_per_stage_desc_pointers(args, shader, true); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_out_lds_offsets); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); @@ -463,7 +462,6 
@@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout); - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_out_lds_offsets); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr); if (stage == MESA_SHADER_VERTEX) declare_vb_descriptor_input_sgprs(args, shader); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 31a659301dd..6235f9f700a 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -169,14 +169,12 @@ enum /* GFX6-8: TCS only */ GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, - GFX6_SGPR_TCS_OUT_OFFSETS, GFX6_SGPR_TCS_OFFCHIP_ADDR, GFX6_SGPR_TCS_IN_LAYOUT, GFX6_TCS_NUM_USER_SGPR, /* GFX9: Merged LS-HS (VS-TCS) only. */ GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR, - GFX9_SGPR_TCS_OUT_OFFSETS, GFX9_SGPR_TCS_OFFCHIP_ADDR, GFX9_TCS_NUM_USER_SGPR, @@ -238,8 +236,16 @@ enum * in the shader via vs_state_bits in LS/HS. */ /* bit gap */ -#define VS_STATE_LS_OUT_VERTEX_SIZE__SHIFT 24 -#define VS_STATE_LS_OUT_VERTEX_SIZE__MASK 0xff /* max 32 * 4 + 1 (to reduce LDS bank conflicts) */ +/* TCS output patch0 offset for per-patch outputs / 4 + * - 64 outputs are implied by SI_UNIQUE_SLOT_* values. 
+ * - max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) * 2(inputs + outputs) / 4 + * = 1M, clamped to 32K(LDS limit) / 4 = 8K + * - only used by si_llvm_tcs_build_end, it can be removed after NIR lowering replaces it + */ +#define VS_STATE_TCS_OUT_PATCH0_OFFSET__SHIFT 10 +#define VS_STATE_TCS_OUT_PATCH0_OFFSET__MASK 0x3fff +#define VS_STATE_LS_OUT_VERTEX_SIZE__SHIFT 24 +#define VS_STATE_LS_OUT_VERTEX_SIZE__MASK 0xff /* max 32 * 4 + 1 (to reduce LDS bank conflicts) */ /* These fields are only set in current_gs_state in si_context, and they are accessible * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader. diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 8e8166da653..44067da3f48 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -52,15 +52,6 @@ struct si_shader_args { */ struct ac_arg tcs_offchip_layout; - /* API TCS */ - /* Offsets where TCS outputs and TCS patch outputs live in LDS (<= 16K): - * [16:31] = TCS output patch0 offset for per-patch / 4, - * 64 outputs are implied by SI_UNIQUE_SLOT_* values. 
- * max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) * 2(inputs + outputs) / 4 - * = 1M, clamped to 32K(LDS limit) / 4 = 8K - */ - struct ac_arg tcs_out_lds_offsets; - /* API TCS & TES */ struct ac_arg tes_offchip_addr; /* PS */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 626dabd3a9a..60df62bd0d0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -65,7 +65,7 @@ static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx) { - return si_unpack_param(ctx, ctx->args->tcs_out_lds_offsets, 16, 16); + return si_unpack_param(ctx, ctx->args->vs_state_bits, 10, 14); } static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) @@ -505,7 +505,6 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) ret = si_insert_input_ret(ctx, ret, ctx->args->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); ret = si_insert_input_ret(ctx, ret, ctx->args->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT); - ret = si_insert_input_ret(ctx, ret, ctx->args->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS); ret = si_insert_input_ret(ctx, ret, ctx->args->tes_offchip_addr, 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR); unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR; @@ -573,7 +572,6 @@ void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_par ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->tcs_offchip_layout); - ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->tes_offchip_addr); } else { ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); @@ -581,7 +579,6 
@@ void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_par ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->tcs_offchip_layout); - ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->tes_offchip_addr); ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->ac.tess_offchip_offset); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 5f388510331..1401e3dd946 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -797,7 +797,6 @@ void si_update_tess_io_layout_state(struct si_context *sctx) assert((ring_va & u_bit_consecutive(0, 19)) == 0); sctx->tes_offchip_ring_va_sgpr = ring_va; - sctx->tcs_out_offsets = ((perpatch_output_offset / 4) << 16); sctx->tcs_offchip_layout = (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) | ((pervertex_output_patch_size * num_patches) << 16); @@ -815,6 +814,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx) /* Set SI_SGPR_VS_STATE_BITS. */ SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_VERTEX_SIZE, input_vertex_size / 4); + SET_FIELD(sctx->current_vs_state, VS_STATE_TCS_OUT_PATCH0_OFFSET, perpatch_output_offset / 4); /* We should be able to support in-shader LDS use with LLVM >= 9 * by just adding the lds_sizes together, but it has never @@ -859,9 +859,8 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx) /* Set userdata SGPRs for merged LS-HS. 
*/ radeon_set_sh_reg_seq( - R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3); + R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 2); radeon_emit(sctx->tcs_offchip_layout); - radeon_emit(sctx->tcs_out_offsets); radeon_emit(sctx->tes_offchip_ring_va_sgpr); } else { /* Due to a hw bug, RSRC2_LS must be written twice with another @@ -874,9 +873,8 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx) /* Set userdata SGPRs for TCS. */ radeon_set_sh_reg_seq( - R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4); + R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3); radeon_emit(sctx->tcs_offchip_layout); - radeon_emit(sctx->tcs_out_offsets); radeon_emit(sctx->tes_offchip_ring_va_sgpr); radeon_emit(sctx->current_vs_state); } @@ -1214,7 +1212,9 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) vs_state |= ENCODE_FIELD(VS_STATE_INDEXED, 1); /* Copy all state bits from vs_state to gs_state except the LS bits. */ - gs_state |= vs_state & CLEAR_FIELD(VS_STATE_LS_OUT_VERTEX_SIZE); + gs_state |= vs_state & + CLEAR_FIELD(VS_STATE_TCS_OUT_PATCH0_OFFSET) & + CLEAR_FIELD(VS_STATE_LS_OUT_VERTEX_SIZE); if (vs_state != sctx->last_vs_state || ((HAS_GS || NGG) && gs_state != sctx->last_gs_state)) {
