RadeonSI does something similar; the VGPR decrease is a win, but I'm not sure if we really want to implement that.
Polaris10: Totals from affected shaders: SGPRS: 116376 -> 116768 (0.34 %) VGPRS: 76556 -> 74868 (-2.20 %) Spilled SGPRs: 10347 -> 10466 (1.15 %) Code Size: 5555072 -> 5569024 (0.25 %) bytes Max Waves: 9854 -> 9951 (0.98 %) Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com> --- src/amd/common/ac_nir_to_llvm.c | 118 +++++++++++++++++++++++++++++++--------- src/amd/common/ac_shader_abi.h | 7 +++ 2 files changed, 98 insertions(+), 27 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 644c85e2eb..eb0935972d 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3131,6 +3131,7 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { LLVMValueRef values[8]; + int location = instr->variables[0]->var->data.location; int idx = instr->variables[0]->var->data.driver_location; int ve = instr->dest.ssa.num_components; unsigned comp = instr->variables[0]->var->data.location_frac; @@ -3167,6 +3168,19 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, instr->num_components, vertex_index, const_index, type); } + LLVMValueRef inputs[4]; + + if (ctx->stage == MESA_SHADER_FRAGMENT) { + ctx->abi->load_fs_inputs(ctx->abi, location, + indir_index, const_index, + stride, inputs); + } else { + unsigned index = idx + + (indir_index ? 
0 : const_index * stride); + + memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs)); + } + for (unsigned chan = comp; chan < ve + comp; chan++) { if (indir_index) { unsigned count = glsl_count_attribute_slots( @@ -3174,14 +3188,15 @@ static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, ctx->stage == MESA_SHADER_VERTEX); count -= chan / 4; LLVMValueRef tmp_vec = ac_build_gather_values_extended( - &ctx->ac, ctx->abi->inputs + idx + chan, count, + &ctx->ac, inputs + chan, count, stride, false, true); values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, ""); - } else - values[chan] = ctx->abi->inputs[idx + chan + const_index * stride]; + } else { + values[chan] = inputs[chan]; + } } break; case nir_var_local: @@ -5556,45 +5571,93 @@ prepare_interp_optimize(struct radv_shader_context *ctx, } } +static unsigned +get_input_hw_index(struct radv_shader_context *ctx, unsigned idx) +{ + struct ac_shader_info *info = &ctx->shader_info->info; + uint64_t mask = info->input_mask & ((1ull << idx) - 1); + + mask &= ~(1ull << VARYING_SLOT_POS); + + return util_bitcount64(mask); +} + +/* If this is true, preload FS inputs at the beginning of shaders. Otherwise, + * reload them at each use. This must be true if the shader is using + * derivatives and KILL, because KILL can leave the WQM and then a lazy + * input load isn't in the WQM anymore. 
+ */ +static bool +radv_preload_fs_inputs(struct radv_shader_context *ctx) +{ + return ctx->shader_info->info.ps.uses_derivatives && + ctx->shader_info->info.ps.uses_kill; +} + static void -handle_fs_inputs(struct radv_shader_context *ctx, - struct nir_shader *nir) +radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx, + LLVMValueRef out[4]) { struct ac_shader_info *info = &ctx->shader_info->info; + if (idx >= VARYING_SLOT_VAR0 || + idx == VARYING_SLOT_PNTC || + idx == VARYING_SLOT_PRIMITIVE_ID || + idx == VARYING_SLOT_LAYER) { + unsigned interp_mode = info->ps.input_interp_mode[idx]; + unsigned interp_loc = info->ps.input_interp_loc[idx]; + unsigned hw_index = get_input_hw_index(ctx, idx); + LLVMValueRef interp_param = + lookup_interp_param(&ctx->abi, interp_mode, interp_loc); + + interp_fs_input(ctx, hw_index, interp_param, ctx->abi.prim_mask, + &out[0]); + } else if (idx == VARYING_SLOT_POS) { + for (int i = 0; i < 3; ++i) + out[i] = ctx->abi.frag_pos[i]; + + out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, + ctx->abi.frag_pos[3]); + } +} + +static void +load_fs_inputs(struct ac_shader_abi *abi, + unsigned location, + LLVMValueRef indir_index, + unsigned const_index, + unsigned stride, + LLVMValueRef out[4]) +{ + struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); + + if (!radv_preload_fs_inputs(ctx)) { + radv_load_fs_inputs(ctx, location, out); + } else { + unsigned index = radeon_llvm_reg_index_soa(location, 0); + + index += (indir_index ? 
0 : const_index * stride); + + memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4); + } +} + +static void +handle_fs_inputs(struct radv_shader_context *ctx, + struct nir_shader *nir) +{ prepare_interp_optimize(ctx, nir); nir_foreach_variable(variable, &nir->inputs) handle_fs_input_decl(ctx, variable); - unsigned index = 0; - for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) { - LLVMValueRef interp_param; LLVMValueRef *inputs = ctx->inputs +radeon_llvm_reg_index_soa(i, 0); if (!(ctx->shader_info->info.input_mask & (1ull << i))) continue; - if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC || - i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) { - unsigned interp_mode = info->ps.input_interp_mode[i]; - unsigned interp_loc = info->ps.input_interp_loc[i]; - - interp_param = lookup_interp_param(&ctx->abi, interp_mode, - interp_loc); - - interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, - inputs); - - ++index; - } else if (i == VARYING_SLOT_POS) { - for(int i = 0; i < 3; ++i) - inputs[i] = ctx->abi.frag_pos[i]; - - inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, - ctx->abi.frag_pos[3]); - } + radv_load_fs_inputs(ctx, i, inputs); } if (ctx->shader_info->info.needs_multiview_view_index) @@ -6924,6 +6987,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, ctx.abi.load_base_vertex = radv_load_base_vertex; } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) { shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard; + ctx.abi.load_fs_inputs = load_fs_inputs; ctx.abi.lookup_interp_param = lookup_interp_param; ctx.abi.load_sample_position = load_sample_position; ctx.abi.load_sample_mask_in = load_sample_mask_in; diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index 901e49b1f9..8e51ce9fdd 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -97,6 +97,13 @@ struct ac_shader_abi { unsigned const_index, LLVMTypeRef type); + void (*load_fs_inputs)(struct 
ac_shader_abi *abi, + unsigned location, + LLVMValueRef indir_index, + unsigned const_index, + unsigned stride, + LLVMValueRef out[4]); + LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi, LLVMTypeRef type, LLVMValueRef vertex_index, -- 2.16.2 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev