Module: Mesa Branch: main Commit: 5c35040da17b92dee64dda9956d1d7a933ae7884 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=5c35040da17b92dee64dda9956d1d7a933ae7884
Author: Timur Kristóf <[email protected]> Date: Wed Sep 1 12:47:47 2021 +0200 aco: Don't write m0 register for LDS instructions on GFX9+. Fossil DB stats on Sienna Cichlid: Totals from 2691 (2.09% of 128647) affected shaders: VGPRs: 124392 -> 124376 (-0.01%) CodeSize: 8192352 -> 8174620 (-0.22%); split: -0.22%, +0.00% MaxWaves: 61516 -> 61524 (+0.01%) Instrs: 1519774 -> 1514958 (-0.32%); split: -0.32%, +0.00% Latency: 14767555 -> 14766145 (-0.01%); split: -0.01%, +0.00% InvThroughput: 3394282 -> 3394173 (-0.00%); split: -0.01%, +0.00% VClause: 31985 -> 32002 (+0.05%); split: -0.02%, +0.07% SClause: 47581 -> 47539 (-0.09%); split: -0.14%, +0.05% Copies: 127533 -> 122709 (-3.78%); split: -3.80%, +0.02% Branches: 39395 -> 39390 (-0.01%) PreSGPRs: 84389 -> 82702 (-2.00%) PreVGPRs: 87520 -> 87519 (-0.00%) Fossil DB stats on Sienna Cichlid with NGGC on: Totals from 60930 (47.36% of 128647) affected shaders: VGPRs: 2180712 -> 2180696 (-0.00%) CodeSize: 169122736 -> 167474304 (-0.97%); split: -0.97%, +0.00% MaxWaves: 1703698 -> 1703706 (+0.00%) Instrs: 32301234 -> 31888743 (-1.28%); split: -1.28%, +0.00% Latency: 152526083 -> 152367301 (-0.10%); split: -0.10%, +0.00% InvThroughput: 25090218 -> 25089812 (-0.00%); split: -0.00%, +0.00% VClause: 577302 -> 577319 (+0.00%); split: -0.00%, +0.00% SClause: 801614 -> 801572 (-0.01%); split: -0.01%, +0.00% Copies: 3399700 -> 2987201 (-12.13%); split: -12.13%, +0.00% Branches: 1262859 -> 1262854 (-0.00%) PreSGPRs: 2175752 -> 2141331 (-1.58%) PreVGPRs: 1785088 -> 1785087 (-0.00%) Signed-off-by: Timur Kristóf <[email protected]> Reviewed-by: Daniel Schürmann <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11224> --- src/amd/compiler/README-ISA.md | 10 ++++++++++ src/amd/compiler/aco_instruction_selection.cpp | 15 ++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index 296ba7a864a..b49e4d05083 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -113,6 +113,16 @@ Some instructions have a `_LEGACY` variant which implements "DX9 rules", in whic the zero "wins" in multiplications, ie. `0.0*x` is always `0.0`. The VEGA ISA mentions `V_MAC_LEGACY_F32` but this instruction is not really there on VEGA. +## `m0` with LDS instructions on Vega and newer + +The Vega ISA doc (both the old one and the "7nm" one) claims that LDS instructions +use the `m0` register for address clamping like older GPUs, but this is not the case. + +In reality, only the `_addtid` variants of LDS instructions use `m0` on Vega and +newer GPUs, so the relevant section of the RDNA ISA doc seems to apply. +LLVM also doesn't emit any initialization of `m0` for LDS instructions, and this +was also confirmed by AMD devs. + ## RDNA L0, L1 cache and DLC, GLC bits The old L1 cache was renamed to L0, and a new L1 cache was added to RDNA. The diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 9953130630c..bd81c50083c 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3911,7 +3911,10 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, Operand load_lds_size_m0(Builder& bld) { - /* TODO: m0 does not need to be initialized on GFX9+ */ + /* m0 does not need to be initialized on GFX9+ */ + if (bld.program->chip_class >= GFX9) + return Operand(s1); + return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu))); } @@ -3977,6 +3980,9 @@ lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned instr = bld.ds(op, Definition(val), offset, m, const_offset); instr->ds().sync = info.sync; + if (m.isUndefined()) + instr->operands.pop_back(); + return val; } @@ -4410,6 +4416,9 @@ store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmas instr = bld.ds(op, address_offset, split_data, m, inline_offset); } instr->ds().sync = memory_sync_info(storage_shared); + + if (m.isUndefined()) + instr->operands.pop_back(); } } @@ -7293,6 +7302,10 @@ visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr) if (return_previous) ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa)); ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw); + + if (m.isUndefined()) + ds->operands.pop_back(); + ctx->block->instructions.emplace_back(std::move(ds)); }
