Inlining push constants into user SGPRs removes some scalar loads
from shaders, but it increases the number of SET_SH_REG packets.
The implementation is currently basic and could be improved if
needed. Inlining dynamic offsets might also help.

Original idea from Dave Airlie.

29077 shaders in 15096 tests
Totals:
SGPRS: 1321325 -> 1357101 (2.71 %)
VGPRS: 936000 -> 932576 (-0.37 %)
Spilled SGPRs: 24804 -> 24791 (-0.05 %)
Code Size: 49827960 -> 49642232 (-0.37 %) bytes
Max Waves: 242007 -> 242700 (0.29 %)

Totals from affected shaders:
SGPRS: 290989 -> 326765 (12.29 %)
VGPRS: 244680 -> 241256 (-1.40 %)
Spilled SGPRs: 1442 -> 1429 (-0.90 %)
Code Size: 8126688 -> 7940960 (-2.29 %) bytes
Max Waves: 80952 -> 81645 (0.86 %)

v2: - check has_only_32bit_push_constants
    - handle base != 0

Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
---
 src/amd/common/ac_nir_to_llvm.c   | 27 +++++++++--
 src/amd/common/ac_shader_abi.h    |  5 ++
 src/amd/vulkan/radv_cmd_buffer.c  | 79 +++++++++++++++++++++++--------
 src/amd/vulkan/radv_nir_to_llvm.c | 58 +++++++++++++++++++++++
 src/amd/vulkan/radv_shader.h      | 11 +++--
 5 files changed, 152 insertions(+), 28 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index efd3e260af1..58918b9570d 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1392,10 +1392,31 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx,
                                              nir_intrinsic_instr *instr)
 {
        LLVMValueRef ptr, addr;
+       LLVMValueRef src0 = get_src(ctx, instr->src[0]);
+       unsigned index = nir_intrinsic_base(instr);
 
-       addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
-       addr = LLVMBuildAdd(ctx->ac.builder, addr,
-                           get_src(ctx, instr->src[0]), "");
+       addr = LLVMConstInt(ctx->ac.i32, index, 0);
+       addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, "");
+
+       /* Load constant values from user SGPRs when possible, otherwise
+        * fall back to the default path that loads directly from memory.
+        */
+       if (LLVMIsConstant(src0) &&
+           instr->dest.ssa.bit_size == 32) {
+               unsigned count = instr->dest.ssa.num_components;
+               unsigned offset = index;
+
+               offset += LLVMConstIntGetZExtValue(src0);
+               offset /= 4;
+
+               offset -= ctx->abi->base_inline_push_consts;
+
+               if (offset + count <= ctx->abi->num_inline_push_consts) {
+                       return ac_build_gather_values(&ctx->ac,
+                                                     ctx->abi->inline_push_consts + offset,
+                                                     count);
+               }
+       }
 
        ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr);
 
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index ee18e6c1923..c9b2c2eb4b8 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -32,6 +32,8 @@ struct nir_variable;
 
 #define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
 
+#define AC_MAX_INLINE_PUSH_CONSTS 8
+
 enum ac_descriptor_type {
        AC_DESC_IMAGE,
        AC_DESC_FMASK,
@@ -66,6 +68,9 @@ struct ac_shader_abi {
 
        /* Vulkan only */
        LLVMValueRef push_constants;
+       LLVMValueRef inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
+       unsigned num_inline_push_consts;
+       unsigned base_inline_push_consts;
        LLVMValueRef view_index;
 
        LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 3b215b4b103..989372e48b7 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -628,6 +628,23 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
        }
 }
 
+static void
+radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer,
+                            struct radv_pipeline *pipeline,
+                            gl_shader_stage stage,
+                            int idx, int count, uint32_t *values)
+{
+       struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
+       uint32_t base_reg = pipeline->user_data_0[stage];
+       if (loc->sgpr_idx == -1)
+               return;
+
+       assert(loc->num_sgprs == count);
+
+       radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
+       radeon_emit_array(cmd_buffer->cs, values, count);
+}
+
 static void
 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
                              struct radv_pipeline *pipeline)
@@ -1901,6 +1918,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
                radv_get_descriptors_state(cmd_buffer, bind_point);
        struct radv_pipeline_layout *layout = pipeline->layout;
        struct radv_shader_variant *shader, *prev_shader;
+       bool need_push_constants = false;
        unsigned offset;
        void *ptr;
        uint64_t va;
@@ -1910,37 +1928,56 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
            (!layout->push_constant_size && !layout->dynamic_offset_count))
                return;
 
-       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
-                                         16 * layout->dynamic_offset_count,
-                                         256, &offset, &ptr))
-               return;
+       radv_foreach_stage(stage, stages) {
+               if (!pipeline->shaders[stage])
+                       continue;
 
-       memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
-       memcpy((char*)ptr + layout->push_constant_size,
-              descriptors_state->dynamic_buffers,
-              16 * layout->dynamic_offset_count);
+               need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
+               need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
 
-       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
-       va += offset;
+               uint8_t base = pipeline->shaders[stage]->info.info.base_inline_push_consts;
+               uint8_t count = pipeline->shaders[stage]->info.info.num_inline_push_consts;
 
-       MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
-                                                          cmd_buffer->cs, MESA_SHADER_STAGES * 4);
+               radv_emit_inline_push_consts(cmd_buffer, pipeline, stage,
+                                            AC_UD_INLINE_PUSH_CONSTANTS,
+                                            count,
+                                            (uint32_t *)&cmd_buffer->push_constants[base * 4]);
+       }
 
-       prev_shader = NULL;
-       radv_foreach_stage(stage, stages) {
-               shader = radv_get_shader(pipeline, stage);
+       if (need_push_constants) {
+               if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
+                                                 16 * layout->dynamic_offset_count,
+                                                 256, &offset, &ptr))
+                       return;
+
+               memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
+               memcpy((char*)ptr + layout->push_constant_size,
+                      descriptors_state->dynamic_buffers,
+                      16 * layout->dynamic_offset_count);
+
+               va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
+               va += offset;
+
+               MAYBE_UNUSED unsigned cdw_max =
+                       radeon_check_space(cmd_buffer->device->ws,
+                                          cmd_buffer->cs, MESA_SHADER_STAGES * 4);
+
+               prev_shader = NULL;
+               radv_foreach_stage(stage, stages) {
+                       shader = radv_get_shader(pipeline, stage);
 
-               /* Avoid redundantly emitting the address for merged stages. */
-               if (shader && shader != prev_shader) {
-                       radv_emit_userdata_address(cmd_buffer, pipeline, stage,
-                                                  AC_UD_PUSH_CONSTANTS, va);
+                       /* Avoid redundantly emitting the address for merged stages. */
+                       if (shader && shader != prev_shader) {
+                               radv_emit_userdata_address(cmd_buffer, pipeline, stage,
+                                                          AC_UD_PUSH_CONSTANTS, va);
 
-                       prev_shader = shader;
+                               prev_shader = shader;
+                       }
                }
+               assert(cmd_buffer->cs->cdw <= cdw_max);
        }
 
        cmd_buffer->push_constant_stages &= ~stages;
-       assert(cmd_buffer->cs->cdw <= cdw_max);
 }
 
 static void
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 29300aeab9f..a0ce569d409 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -627,6 +627,50 @@ count_vs_user_sgprs(struct radv_shader_context *ctx)
        return count;
 }
 
+static void allocate_inline_push_consts(struct radv_shader_context *ctx,
+                                       struct user_sgpr_info *user_sgpr_info)
+{
+       uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs;
+
+       /* Only supported if shaders use push constants. */
+       if (ctx->shader_info->info.min_push_constant_used == UINT8_MAX)
+               return;
+
+       /* Only supported if shaders don't have indirect push constants. */
+       if (ctx->shader_info->info.has_indirect_push_constants)
+               return;
+
+       /* Only supported for 32-bit push constants. */
+       if (!ctx->shader_info->info.has_only_32bit_push_constants)
+               return;
+
+       uint8_t num_push_consts =
+               (ctx->shader_info->info.max_push_constant_used -
+                ctx->shader_info->info.min_push_constant_used) / 4;
+
+       /* Check if the number of user SGPRs is large enough. */
+       if (num_push_consts < remaining_sgprs) {
+               ctx->shader_info->info.num_inline_push_consts = num_push_consts;
+       } else {
+               ctx->shader_info->info.num_inline_push_consts = remaining_sgprs;
+       }
+
+       /* Clamp to the maximum number of allowed inlined push constants. */
+       if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS)
+               ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS;
+
+       if (ctx->shader_info->info.num_inline_push_consts == num_push_consts &&
+           !ctx->shader_info->info.loads_dynamic_offsets) {
+               /* Disable the default push constants path if all constants are
+                * inlined and if shaders don't use dynamic descriptors.
+                */
+               ctx->shader_info->info.loads_push_constants = false;
+       }
+
+       ctx->shader_info->info.base_inline_push_consts =
+               ctx->shader_info->info.min_push_constant_used / 4;
+}
+
 static void allocate_user_sgprs(struct radv_shader_context *ctx,
                                gl_shader_stage stage,
                                bool has_previous_stage,
@@ -706,6 +750,8 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx,
        } else {
                 user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set;
        }
+
+       allocate_inline_push_consts(ctx, user_sgpr_info);
 }
 
 static void
@@ -735,6 +781,13 @@ declare_global_input_sgprs(struct radv_shader_context *ctx,
                add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants);
        }
 
+       for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) {
+               add_arg(args, ARG_SGPR, ctx->ac.i32,
+                       &ctx->abi.inline_push_consts[i]);
+       }
+       ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts;
+       ctx->abi.base_inline_push_consts = ctx->shader_info->info.base_inline_push_consts;
+
        if (ctx->shader_info->info.so.num_outputs) {
                add_arg(args, ARG_SGPR,
                        ac_array_in_const32_addr_space(ctx->ac.v4i32),
@@ -853,6 +906,11 @@ set_global_input_locs(struct radv_shader_context *ctx,
                set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
        }
 
+       if (ctx->shader_info->info.num_inline_push_consts) {
+               set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx,
+                              ctx->shader_info->info.num_inline_push_consts);
+       }
+
        if (ctx->streamout_buffers) {
                set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS,
                               user_sgpr_idx);
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index c194401c02d..e0d27378724 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -129,10 +129,11 @@ struct radv_nir_compiler_options {
 enum radv_ud_index {
        AC_UD_SCRATCH_RING_OFFSETS = 0,
        AC_UD_PUSH_CONSTANTS = 1,
-       AC_UD_INDIRECT_DESCRIPTOR_SETS = 2,
-       AC_UD_VIEW_INDEX = 3,
-       AC_UD_STREAMOUT_BUFFERS = 4,
-       AC_UD_SHADER_START = 5,
+       AC_UD_INLINE_PUSH_CONSTANTS = 2,
+       AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
+       AC_UD_VIEW_INDEX = 4,
+       AC_UD_STREAMOUT_BUFFERS = 5,
+       AC_UD_SHADER_START = 6,
        AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
        AC_UD_VS_BASE_VERTEX_START_INSTANCE,
        AC_UD_VS_MAX_UD,
@@ -167,6 +168,8 @@ struct radv_shader_info {
        uint8_t max_push_constant_used;
        bool has_only_32bit_push_constants;
        bool has_indirect_push_constants;
+       uint8_t num_inline_push_consts;
+       uint8_t base_inline_push_consts;
        uint32_t desc_set_used_mask;
        bool needs_multiview_view_index;
        bool uses_invocation_id;
-- 
2.20.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to