Module: Mesa
Branch: main
Commit: 6959493f8c2a0542d13312069659c3c3e233206e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6959493f8c2a0542d13312069659c3c3e233206e

Author: Marek Olšák <[email protected]>
Date:   Thu Jun  8 00:12:39 2023 -0400

radeonsi: move the only tcs_out_lds_offsets field to vs_state_bits

This removes 1 user data SGPR.

Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23517>

---

 src/gallium/drivers/radeonsi/si_pipe.h             |  1 -
 src/gallium/drivers/radeonsi/si_shader.c           |  2 --
 src/gallium/drivers/radeonsi/si_shader.h           | 14 ++++++++++----
 src/gallium/drivers/radeonsi/si_shader_internal.h  |  9 ---------
 src/gallium/drivers/radeonsi/si_shader_llvm_tess.c |  5 +----
 src/gallium/drivers/radeonsi/si_state_draw.cpp     | 12 ++++++------
 6 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 20860ec4c90..74eb42da4bc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1183,7 +1183,6 @@ struct si_context {
    unsigned last_tes_sh_base;
    bool last_tess_uses_primid;
    unsigned num_patches_per_workgroup;
-   unsigned tcs_out_offsets;
    unsigned tcs_offchip_layout;
    unsigned tes_offchip_ring_va_sgpr;
    unsigned ls_hs_rsrc2;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 2b7b00de1c0..e6d234ede6c 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -412,7 +412,6 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
       declare_global_desc_pointers(args);
       declare_per_stage_desc_pointers(args, shader, true);
       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
-      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_out_lds_offsets);
       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
@@ -463,7 +462,6 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
-      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_out_lds_offsets);
       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
       if (stage == MESA_SHADER_VERTEX)
          declare_vb_descriptor_input_sgprs(args, shader);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 31a659301dd..6235f9f700a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -169,14 +169,12 @@ enum
 
    /* GFX6-8: TCS only */
    GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
-   GFX6_SGPR_TCS_OUT_OFFSETS,
    GFX6_SGPR_TCS_OFFCHIP_ADDR,
    GFX6_SGPR_TCS_IN_LAYOUT,
    GFX6_TCS_NUM_USER_SGPR,
 
    /* GFX9: Merged LS-HS (VS-TCS) only. */
    GFX9_SGPR_TCS_OFFCHIP_LAYOUT = SI_VS_NUM_USER_SGPR,
-   GFX9_SGPR_TCS_OUT_OFFSETS,
    GFX9_SGPR_TCS_OFFCHIP_ADDR,
    GFX9_TCS_NUM_USER_SGPR,
 
@@ -238,8 +236,16 @@ enum
  * in the shader via vs_state_bits in LS/HS.
  */
 /* bit gap */
-#define VS_STATE_LS_OUT_VERTEX_SIZE__SHIFT   24
-#define VS_STATE_LS_OUT_VERTEX_SIZE__MASK    0xff /* max 32 * 4 + 1 (to reduce LDS bank conflicts) */
+/* TCS output patch0 offset for per-patch outputs / 4
+ * - 64 outputs are implied by SI_UNIQUE_SLOT_* values.
+ * - max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) * 2(inputs + outputs) / 4
+ *       = 1M, clamped to 32K(LDS limit) / 4 = 8K
+ * - only used by si_llvm_tcs_build_end, it can be removed after NIR lowering replaces it
+ */
+#define VS_STATE_TCS_OUT_PATCH0_OFFSET__SHIFT   10
+#define VS_STATE_TCS_OUT_PATCH0_OFFSET__MASK    0x3fff
+#define VS_STATE_LS_OUT_VERTEX_SIZE__SHIFT      24
+#define VS_STATE_LS_OUT_VERTEX_SIZE__MASK       0xff /* max 32 * 4 + 1 (to reduce LDS bank conflicts) */
 
 /* These fields are only set in current_gs_state in si_context, and they are accessible
  * in the shader via vs_state_bits in legacy GS, the GS copy shader, and any NGG shader.
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 8e8166da653..44067da3f48 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -52,15 +52,6 @@ struct si_shader_args {
     */
    struct ac_arg tcs_offchip_layout;
 
-   /* API TCS */
-   /* Offsets where TCS outputs and TCS patch outputs live in LDS (<= 16K):
-    *   [16:31] = TCS output patch0 offset for per-patch / 4,
-    *       64 outputs are implied by SI_UNIQUE_SLOT_* values.
-    *       max = 32(CPs) * 64(outputs) * 16(vec4) * 64(num_patches) * 2(inputs + outputs) / 4
-    *           = 1M, clamped to 32K(LDS limit) / 4 = 8K
-    */
-   struct ac_arg tcs_out_lds_offsets;
-
    /* API TCS & TES */
    struct ac_arg tes_offchip_addr;
    /* PS */
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
index 626dabd3a9a..60df62bd0d0 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
@@ -65,7 +65,7 @@ static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
 
 static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 {
-   return si_unpack_param(ctx, ctx->args->tcs_out_lds_offsets, 16, 16);
+   return si_unpack_param(ctx, ctx->args->vs_state_bits, 10, 14);
 }
 
 static LLVMValueRef get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
@@ -505,7 +505,6 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
    ret = si_insert_input_ret(ctx, ret, ctx->args->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
 
    ret = si_insert_input_ret(ctx, ret, ctx->args->tcs_offchip_layout, 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
-   ret = si_insert_input_ret(ctx, ret, ctx->args->tcs_out_lds_offsets, 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
    ret = si_insert_input_ret(ctx, ret, ctx->args->tes_offchip_addr, 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR);
 
    unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
@@ -573,7 +572,6 @@ void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_par
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->tcs_offchip_layout);
-      ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->tes_offchip_addr);
    } else {
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
@@ -581,7 +579,6 @@ void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_par
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->tcs_offchip_layout);
-      ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->tes_offchip_addr);
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
       ac_add_arg(&ctx->args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args->ac.tess_offchip_offset);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 5f388510331..1401e3dd946 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -797,7 +797,6 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
    assert((ring_va & u_bit_consecutive(0, 19)) == 0);
 
    sctx->tes_offchip_ring_va_sgpr = ring_va;
-   sctx->tcs_out_offsets = ((perpatch_output_offset / 4) << 16);
    sctx->tcs_offchip_layout =
       (num_patches - 1) | ((num_tcs_output_cp - 1) << 6) | ((num_tcs_input_cp - 1) << 11) |
       ((pervertex_output_patch_size * num_patches) << 16);
@@ -815,6 +814,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
 
    /* Set SI_SGPR_VS_STATE_BITS. */
    SET_FIELD(sctx->current_vs_state, VS_STATE_LS_OUT_VERTEX_SIZE, input_vertex_size / 4);
+   SET_FIELD(sctx->current_vs_state, VS_STATE_TCS_OUT_PATCH0_OFFSET, perpatch_output_offset / 4);
 
    /* We should be able to support in-shader LDS use with LLVM >= 9
     * by just adding the lds_sizes together, but it has never
@@ -859,9 +859,8 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx)
 
       /* Set userdata SGPRs for merged LS-HS. */
       radeon_set_sh_reg_seq(
-         R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
+         R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 2);
       radeon_emit(sctx->tcs_offchip_layout);
-      radeon_emit(sctx->tcs_out_offsets);
       radeon_emit(sctx->tes_offchip_ring_va_sgpr);
    } else {
       /* Due to a hw bug, RSRC2_LS must be written twice with another
@@ -874,9 +873,8 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx)
 
       /* Set userdata SGPRs for TCS. */
       radeon_set_sh_reg_seq(
-         R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4);
+         R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3);
       radeon_emit(sctx->tcs_offchip_layout);
-      radeon_emit(sctx->tcs_out_offsets);
       radeon_emit(sctx->tes_offchip_ring_va_sgpr);
       radeon_emit(sctx->current_vs_state);
    }
@@ -1214,7 +1212,9 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
       vs_state |= ENCODE_FIELD(VS_STATE_INDEXED, 1);
 
    /* Copy all state bits from vs_state to gs_state except the LS bits. */
-   gs_state |= vs_state & CLEAR_FIELD(VS_STATE_LS_OUT_VERTEX_SIZE);
+   gs_state |= vs_state &
+               CLEAR_FIELD(VS_STATE_TCS_OUT_PATCH0_OFFSET) &
+               CLEAR_FIELD(VS_STATE_LS_OUT_VERTEX_SIZE);
 
    if (vs_state != sctx->last_vs_state ||
        ((HAS_GS || NGG) && gs_state != sctx->last_gs_state)) {

Reply via email to