Module: Mesa
Branch: main
Commit: b7cd0eb439904f500b700657bb31572ed86c43b4
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b7cd0eb439904f500b700657bb31572ed86c43b4

Author: Georg Lehmann <[email protected]>
Date:   Sun Feb 19 16:49:02 2023 +0100

aco: use v_permlane(x)16_b32 for masked swizzle

Should be cheaper than ds_swizzle.

Totals from 8 (0.01% of 134913) affected shaders:
CodeSize: 16316 -> 16388 (+0.44%)
Instrs: 3088 -> 3086 (-0.06%)
Latency: 49558 -> 49508 (-0.10%)
InvThroughput: 9180 -> 9198 (+0.20%)
Copies: 376 -> 384 (+2.13%)

Reviewed-by: Rhys Perry <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21412>

---

 src/amd/compiler/aco_instruction_selection.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 1f4547d1433..1de943bde4f 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -249,6 +249,10 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
 
    uint16_t dpp_ctrl = 0xffff;
 
+   /* DPP16 before DPP8 before v_permlane(x)16_b32
+    * because DPP16 supports modifiers and v_permlane
+    * can't be folded into valu instructions.
+    */
    if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
       unsigned res[4] = {0, 1, 2, 3};
       for (unsigned i = 0; i < 4; i++)
@@ -262,12 +266,22 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
       dpp_ctrl = dpp_row_half_mirror;
    } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && or_mask < 8 &&
               xor_mask < 8) {
-      // DPP8 comes last, as it does not allow several modifiers like `abs` that are available with DPP16
       Builder::Result ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src);
       for (unsigned i = 0; i < 8; i++) {
          ret->dpp8().lane_sel[i] = (((i & and_mask) | or_mask) ^ xor_mask) & 0x7;
       }
       return ret;
+   } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10 && or_mask < 0x10) {
+      uint64_t lane_mask = 0;
+      for (unsigned i = 0; i < 16; i++)
+         lane_mask |= uint64_t(((i & and_mask) | or_mask) ^ (xor_mask & 0xf)) << i * 4;
+      aco_opcode opcode =
+         xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
+      Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
+      Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
+      Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
+      ret->vop3().opsel = 0x3; /* set BOUND_CTRL/FETCH_INACTIVE */
+      return ret;
    }
 
    if (dpp_ctrl != 0xffff)
