Module: Mesa Branch: main Commit: 19db6b760aa3dd1ce510e80e5567992d955cd067 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=19db6b760aa3dd1ce510e80e5567992d955cd067
Author: Pavel Ondračka <[email protected]> Date: Fri Mar 4 10:27:54 2022 +0100 r300: set PVS_LAST_VTX_SRC_INST properly to last input read From docs: The PVS Instruction which uses the Input Vertex Memory for the last time. This value is used to free up the Input Vertex Slots ASAP. This field must be set to a valid instruction. Right now it is set to the last instruction. When the last read is inside a loop, set it on the outermost ENDLOOP. This could in theory help performance, but none of my usual benchmarks including GLmark, Unigine Sanctuary or Lightsmark show any measurable performance difference. Suggested in: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6045 Signed-off-by: Pavel Ondračka <[email protected]> Reviewed-by: Emma Anholt <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15252> --- src/gallium/drivers/r300/compiler/r3xx_vertprog.c | 18 +++++++++++++++++- src/gallium/drivers/r300/compiler/radeon_code.h | 1 + src/gallium/drivers/r300/r300_emit.c | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c index bc65fa4c80b..39db61e1682 100644 --- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c +++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c @@ -371,10 +371,12 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {}; unsigned loop_depth = 0; + bool last_input_read_at_loop_end = false; compiler->code->pos_end = 0; /* Not supported yet */ compiler->code->length = 0; compiler->code->num_temporaries = 0; + compiler->code->last_input_read = 0; compiler->SetHwInputOutput(compiler); @@ -448,6 +450,11 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) unsigned int last_addr; unsigned int ret_addr; + if (loop_depth == 1 && last_input_read_at_loop_end) { + compiler->code->last_input_read = 
compiler->code->length / 4; + last_input_read_at_loop_end = false; + } + ret_addr = loops[--loop_depth]; act_addr = ret_addr - 1; last_addr = (compiler->code->length / 4) - 1; @@ -536,10 +543,19 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) vpi->DstReg.Index >= compiler->code->num_temporaries) compiler->code->num_temporaries = vpi->DstReg.Index + 1; - for (unsigned i = 0; i < info->NumSrcRegs; i++) + for (unsigned i = 0; i < info->NumSrcRegs; i++) { if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY && vpi->SrcReg[i].Index >= compiler->code->num_temporaries) compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1; + if (vpi->SrcReg[i].File == RC_FILE_INPUT) { + if (loop_depth == 0) + compiler->code->last_input_read = compiler->code->length / 4; + else + last_input_read_at_loop_end = true; + } + + } + if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) { rc_error(&compiler->Base, "Too many temporaries.\n"); diff --git a/src/gallium/drivers/r300/compiler/radeon_code.h b/src/gallium/drivers/r300/compiler/radeon_code.h index 0c90a7ccb52..52bfab7f1f5 100644 --- a/src/gallium/drivers/r300/compiler/radeon_code.h +++ b/src/gallium/drivers/r300/compiler/radeon_code.h @@ -270,6 +270,7 @@ struct r300_vertex_program_code { int num_temporaries; /* Number of temp vars used by program */ int inputs[VSF_MAX_INPUTS]; int outputs[VSF_MAX_OUTPUTS]; + unsigned last_input_read; struct rc_constant_list constants; unsigned *constants_remap_table; diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c index 0411b950224..efb1cc792dd 100644 --- a/src/gallium/drivers/r300/r300_emit.c +++ b/src/gallium/drivers/r300/r300_emit.c @@ -1128,7 +1128,7 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state) OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_0, R300_PVS_FIRST_INST(0) | R300_PVS_XYZW_VALID_INST(instruction_count - 1) | R300_PVS_LAST_INST(instruction_count - 1)); - 
OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, instruction_count - 1); + OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, code->last_input_read); OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0); OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, code->length);
