RadeonSI does something similar; the VGPR decrease is a win,
but I'm not sure if we really want to implement this.

Polaris10:
Totals from affected shaders:
SGPRS: 116376 -> 116768 (0.34 %)
VGPRS: 76556 -> 74868 (-2.20 %)
Spilled SGPRs: 10347 -> 10466 (1.15 %)
Code Size: 5555072 -> 5569024 (0.25 %) bytes
Max Waves: 9854 -> 9951 (0.98 %)

Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
---
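Notes (not for the commit log): the idea is to stop unconditionally
interpolating every FS input in handle_fs_inputs() and instead
interpolate at each nir_var_shader_in load through the new
load_fs_inputs() ABI callback. Preloading is kept when the shader uses
both derivatives and KILL, because a lazy interpolation emitted after a
KILL may no longer execute in WQM. A condensed sketch of the dispatch,
simplified from the diff below (the visit_load_var plumbing is omitted):

    static void
    load_fs_inputs(struct ac_shader_abi *abi, unsigned location,
                   LLVMValueRef indir_index, unsigned const_index,
                   unsigned stride, LLVMValueRef out[4])
    {
            struct radv_shader_context *ctx =
                    radv_shader_context_from_abi(abi);

            if (!radv_preload_fs_inputs(ctx)) {
                    /* Lazy path: interpolate this slot at the use site. */
                    radv_load_fs_inputs(ctx, location, out);
            } else {
                    /* Preload path: handle_fs_inputs() already computed
                     * the values in WQM, fetch them from the ABI array. */
                    unsigned index = radeon_llvm_reg_index_soa(location, 0) +
                                     (indir_index ? 0 : const_index * stride);
                    memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
            }
    }
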
 src/amd/common/ac_nir_to_llvm.c | 118 +++++++++++++++++++++++++++++++---------
 src/amd/common/ac_shader_abi.h  |   7 +++
 2 files changed, 98 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 644c85e2eb..eb0935972d 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
                                   nir_intrinsic_instr *instr)
 {
        LLVMValueRef values[8];
+       int location = instr->variables[0]->var->data.location;
        int idx = instr->variables[0]->var->data.driver_location;
        int ve = instr->dest.ssa.num_components;
        unsigned comp = instr->variables[0]->var->data.location_frac;
@@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
                                                     instr->num_components, vertex_index, const_index, type);
                }
 
+               LLVMValueRef inputs[4];
+
+               if (ctx->stage == MESA_SHADER_FRAGMENT) {
+                       ctx->abi->load_fs_inputs(ctx->abi, location,
+                                                indir_index, const_index,
+                                                stride, inputs);
+               } else {
+                       unsigned index = idx +
+                               (indir_index ? 0 : const_index * stride);
+
+                       memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs));
+               }
+
                for (unsigned chan = comp; chan < ve + comp; chan++) {
                        if (indir_index) {
                                unsigned count = glsl_count_attribute_slots(
@@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
                                                ctx->stage == MESA_SHADER_VERTEX);
                                count -= chan / 4;
                                LLVMValueRef tmp_vec = ac_build_gather_values_extended(
-                                               &ctx->ac, ctx->abi->inputs + idx + chan, count,
+                                               &ctx->ac, inputs + chan, count,
                                                stride, false, true);
 
                                values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
                                                                       tmp_vec,
                                                                       indir_index, "");
-                       } else
-                               values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
+                       } else {
+                               values[chan] = inputs[chan];
+                       }
                }
                break;
        case nir_var_local:
@@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct radv_shader_context *ctx,
        }
 }
 
+/* Map varying slot 'idx' to its hardware input index: the number of
+ * enabled input slots below it, not counting VARYING_SLOT_POS, which is
+ * read from frag_pos rather than interpolated.
+ */
+static unsigned
+get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)
+{
+       struct ac_shader_info *info = &ctx->shader_info->info;
+       uint64_t mask = info->input_mask & ((1ull << idx) - 1);
+
+       mask &= ~(1ull << VARYING_SLOT_POS);
+
+       return util_bitcount64(mask);
+}
+
+/* If this is true, preload FS inputs at the beginning of shaders. Otherwise,
+ * reload them at each use. This must be true if the shader is using
+ * derivatives and KILL, because KILL can leave the WQM and then a lazy
+ * input load isn't in the WQM anymore.
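+ *
+ * For example, a shader may KILL under a condition and later compute
+ * dFdx() on a value derived from an interpolated input; if the
+ * interpolation is emitted after the KILL, the quad's helper invocations
+ * may already be disabled and the derivative becomes undefined.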
+ */
+static bool
+radv_preload_fs_inputs(struct radv_shader_context *ctx)
+{
+       return ctx->shader_info->info.ps.uses_derivatives &&
+              ctx->shader_info->info.ps.uses_kill;
+}
+
 static void
-handle_fs_inputs(struct radv_shader_context *ctx,
-                 struct nir_shader *nir)
+radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,
+                   LLVMValueRef out[4])
 {
        struct ac_shader_info *info = &ctx->shader_info->info;
 
+       if (idx >= VARYING_SLOT_VAR0 ||
+           idx == VARYING_SLOT_PNTC ||
+           idx == VARYING_SLOT_PRIMITIVE_ID ||
+           idx == VARYING_SLOT_LAYER) {
+               unsigned interp_mode = info->ps.input_interp_mode[idx];
+               unsigned interp_loc = info->ps.input_interp_loc[idx];
+               unsigned hw_index = get_input_hw_index(ctx, idx);
+               LLVMValueRef interp_param =
+                       lookup_interp_param(&ctx->abi, interp_mode, interp_loc);
+
+               interp_fs_input(ctx, hw_index, interp_param, ctx->abi.prim_mask,
+                               &out[0]);
+       } else if (idx == VARYING_SLOT_POS) {
+               for (int i = 0; i < 3; ++i)
+                       out[i] = ctx->abi.frag_pos[i];
+
+               out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
+                                      ctx->abi.frag_pos[3]);
+       }
+}
+
+static void
+load_fs_inputs(struct ac_shader_abi *abi,
+              unsigned location,
+              LLVMValueRef indir_index,
+              unsigned const_index,
+              unsigned stride,
+              LLVMValueRef out[4])
+{
+       struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
+
+       if (!radv_preload_fs_inputs(ctx)) {
+               radv_load_fs_inputs(ctx, location, out);
+       } else {
+               unsigned index = radeon_llvm_reg_index_soa(location, 0);
+
+               index += (indir_index ? 0 : const_index * stride);
+
+               memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
+       }
+}
+
+static void
+handle_fs_inputs(struct radv_shader_context *ctx,
+                 struct nir_shader *nir)
+{
        prepare_interp_optimize(ctx, nir);
 
        nir_foreach_variable(variable, &nir->inputs)
                handle_fs_input_decl(ctx, variable);
 
-       unsigned index = 0;
-
        for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
-               LLVMValueRef interp_param;
                LLVMValueRef *inputs = ctx->inputs + radeon_llvm_reg_index_soa(i, 0);
 
                if (!(ctx->shader_info->info.input_mask & (1ull << i)))
                        continue;
 
-               if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
-                   i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
-                       unsigned interp_mode = info->ps.input_interp_mode[i];
-                       unsigned interp_loc = info->ps.input_interp_loc[i];
-
-                       interp_param = lookup_interp_param(&ctx->abi, interp_mode,
-                                                          interp_loc);
-
-                       interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
-                                       inputs);
-
-                       ++index;
-               } else if (i == VARYING_SLOT_POS) {
-                       for(int i = 0; i < 3; ++i)
-                               inputs[i] = ctx->abi.frag_pos[i];
-
-                       inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
-                                                 ctx->abi.frag_pos[3]);
-               }
+               radv_load_fs_inputs(ctx, i, inputs);
        }
 
        if (ctx->shader_info->info.needs_multiview_view_index)
@@ -6924,6 +6987,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
                        ctx.abi.load_base_vertex = radv_load_base_vertex;
                } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
                        shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
+                       ctx.abi.load_fs_inputs = load_fs_inputs;
                        ctx.abi.lookup_interp_param = lookup_interp_param;
                        ctx.abi.load_sample_position = load_sample_position;
                        ctx.abi.load_sample_mask_in = load_sample_mask_in;
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index 901e49b1f9..8e51ce9fdd 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -97,6 +97,13 @@ struct ac_shader_abi {
                                    unsigned const_index,
                                    LLVMTypeRef type);
 
+       void (*load_fs_inputs)(struct ac_shader_abi *abi,
+                              unsigned location,
+                              LLVMValueRef indir_index,
+                              unsigned const_index,
+                              unsigned stride,
+                              LLVMValueRef out[4]);
+
        LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,
                                           LLVMTypeRef type,
                                           LLVMValueRef vertex_index,
-- 
2.16.2
