https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/141589

>From e253bde72750576cab699ad1b6b872fbf60dffe9 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutr...@amd.com>
Date: Tue, 27 May 2025 11:16:16 +0200
Subject: [PATCH 1/2] [AMDGPU] Move S_BFE lowering into RegBankCombiner

---
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       |  14 +-
 .../Target/AMDGPU/AMDGPURegBankCombiner.cpp   |  51 +++++++
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  | 125 ++++++++----
 3 files changed, 119 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9587fad1ecd63..94e1175b06b14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -151,6 +151,17 @@ def zext_of_shift_amount_combines : GICombineGroup<[
   canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
 ]>;

+// Early select of uniform BFX into S_BFE instructions.
+// These instructions encode the offset/width in a way that requires using
+// bitwise operations. Selecting these instructions early allow the combiner
+// to potentially fold these.
+class lower_uniform_bfx<Instruction bfx> : GICombineRule<
+  (defs root:$bfx),
+  (combine (bfx $dst, $src, $o, $w):$bfx, [{ return lowerUniformBFX(*${bfx}); }])>;
+
+def lower_uniform_sbfx : lower_uniform_bfx<G_SBFX>;
+def lower_uniform_ubfx : lower_uniform_bfx<G_UBFX>;
+
 let Predicates = [Has16BitInsts, NotHasMed3_16] in {
 // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
 // saves one instruction compared to the promotion.
@@ -198,5 +209,6 @@ def AMDGPURegBankCombiner : GICombiner<
   zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
   identity_combines, redundant_and, constant_fold_cast_op,
-  cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+  cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+  lower_uniform_sbfx, lower_uniform_ubfx]> {
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5e93f0f..2100900bb8eb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
   void applyCanonicalizeZextShiftAmt(MachineInstr &MI,
                                      MachineInstr &Ext) const;

+  bool lowerUniformBFX(MachineInstr &MI) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -392,6 +394,55 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
   MI.eraseFromParent();
 }

+bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
+  assert(MI.getOpcode() == TargetOpcode::G_UBFX ||
+         MI.getOpcode() == TargetOpcode::G_SBFX);
+  const bool Signed = (MI.getOpcode() == TargetOpcode::G_SBFX);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  const RegisterBank *RB = RBI.getRegBank(DstReg, MRI, TRI);
+  assert(RB && "No RB?");
+  if (RB->getID() != AMDGPU::SGPRRegBankID)
+    return false;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register OffsetReg = MI.getOperand(2).getReg();
+  Register WidthReg = MI.getOperand(3).getReg();
+
+  const LLT S32 = LLT::scalar(32);
+  LLT Ty = MRI.getType(DstReg);
+
+  const unsigned Opc = (Ty == S32)
+                           ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+                           : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+  // Ensure the high bits are clear to insert the offset.
+  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
+  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+
+  // Zeros out the low bits, so don't bother clamping the input value.
+  auto ShiftAmt = B.buildConstant(S32, 16);
+  auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
+
+  // Transformation function, pack the offset and width of a BFE into
+  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+  // source, bits [5:0] contain the offset and bits [22:16] the width.
+  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+
+  MRI.setRegBank(OffsetMask.getReg(0), *RB);
+  MRI.setRegBank(ClampOffset.getReg(0), *RB);
+  MRI.setRegBank(ShiftAmt.getReg(0), *RB);
+  MRI.setRegBank(ShiftWidth.getReg(0), *RB);
+  MRI.setRegBank(MergedInputs.getReg(0), *RB);
+
+  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
+    llvm_unreachable("failed to constrain BFE");
+
+  MI.eraseFromParent();
+  return true;
+}
+
 SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
   return MF.getInfo<SIMachineFunctionInfo>()->getMode();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..0b7d64ee67c34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1492,88 +1492,73 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

   const RegisterBank *DstBank =
-    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
-  if (DstBank == &AMDGPU::VGPRRegBank) {
-    if (Ty == S32)
-      return true;
-
-    // There is no 64-bit vgpr bitfield extract instructions so the operation
-    // is expanded to a sequence of instructions that implement the operation.
-    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

-    const LLT S64 = LLT::scalar(64);
-    // Shift the source operand so that extracted bits start at bit 0.
-    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
-                              : B.buildLShr(S64, SrcReg, OffsetReg);
-    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
-
-    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
-    // if the width is a constant.
-    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
-      // Use the 32-bit bitfield extract instruction if the width is a constant.
-      // Depending on the width size, use either the low or high 32-bits.
-      auto Zero = B.buildConstant(S32, 0);
-      auto WidthImm = ConstWidth->Value.getZExtValue();
-      if (WidthImm <= 32) {
-        // Use bitfield extract on the lower 32-bit source, and then sign-extend
-        // or clear the upper 32-bits.
-        auto Extract =
-            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
-                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
-        auto Extend =
-            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
-        B.buildMergeLikeInstr(DstReg, {Extract, Extend});
-      } else {
-        // Use bitfield extract on upper 32-bit source, and combine with lower
-        // 32-bit source.
-        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
-        auto Extract =
-            Signed
-                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
-                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
-        B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
-      }
-      MI.eraseFromParent();
+  if (DstBank != &AMDGPU::VGPRRegBank) {
+    // SGPR: Canonicalize to a G_S/UBFX
+    if (!isa<GIntrinsic>(MI))
       return true;
-    }

-    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
-    // operations.
-    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
-    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+    ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
     if (Signed)
-      B.buildAShr(S64, SignBit, ExtShift);
+      B.buildSbfx(DstReg, SrcReg, OffsetReg, WidthReg);
     else
-      B.buildLShr(S64, SignBit, ExtShift);
+      B.buildUbfx(DstReg, SrcReg, OffsetReg, WidthReg);
     MI.eraseFromParent();
     return true;
   }

-  // The scalar form packs the offset and width in a single operand.
-
-  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
-
-  // Ensure the high bits are clear to insert the offset.
-  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
-  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
-
-  // Zeros out the low bits, so don't bother clamping the input value.
-  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
-
-  // Transformation function, pack the offset and width of a BFE into
-  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
-  // source, bits [5:0] contain the offset and bits [22:16] the width.
-  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+  // VGPR
+  if (Ty == S32)
+    return true;

-  // TODO: It might be worth using a pseudo here to avoid scc clobber and
-  // register class constraints.
-  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
-                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+  // There is no 64-bit vgpr bitfield extract instructions so the operation
+  // is expanded to a sequence of instructions that implement the operation.
+  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);

-  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
-  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
-    llvm_unreachable("failed to constrain BFE");
+  const LLT S64 = LLT::scalar(64);
+  // Shift the source operand so that extracted bits start at bit 0.
+  auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
+                            : B.buildLShr(S64, SrcReg, OffsetReg);
+  auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
+
+  // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
+  // if the width is a constant.
+  if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
+    // Use the 32-bit bitfield extract instruction if the width is a constant.
+    // Depending on the width size, use either the low or high 32-bits.
+    auto Zero = B.buildConstant(S32, 0);
+    auto WidthImm = ConstWidth->Value.getZExtValue();
+    if (WidthImm <= 32) {
+      // Use bitfield extract on the lower 32-bit source, and then sign-extend
+      // or clear the upper 32-bits.
+      auto Extract =
+          Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
+                 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
+      auto Extend =
+          Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
+      B.buildMergeLikeInstr(DstReg, {Extract, Extend});
+    } else {
+      // Use bitfield extract on upper 32-bit source, and combine with lower
+      // 32-bit source.
+      auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
+      auto Extract =
+          Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
+                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
+      B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+  // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
+  // operations.
+  auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
+  auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
+  if (Signed)
+    B.buildAShr(S64, SignBit, ExtShift);
+  else
+    B.buildLShr(S64, SignBit, ExtShift);
   MI.eraseFromParent();
   return true;
 }

>From b1dc82079cb73a1fa1a8b236692217d2a33bac27 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutr...@amd.com>
Date: Wed, 28 May 2025 10:37:09 +0200
Subject: [PATCH 2/2] style change

---
 llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 2100900bb8eb2..a364887fa69d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -412,7 +412,7 @@ bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
   const LLT S32 = LLT::scalar(32);
   LLT Ty = MRI.getType(DstReg);

-  const unsigned Opc = (Ty == S32)
+  const unsigned Opc = Ty == S32
                            ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
                            : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
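For readers skimming the patch: both the new lowerUniformBFX combine and the old scalar path in applyMappingBFE build the same packed second source for S_BFE. The sketch below mirrors only the bit layout described in the patch's own comments (offset in bits [5:0], width in bits [22:16]), using plain host integers; the helper name packBFEOperand and the use of uint32_t values instead of MachineIRBuilder-built G_AND/G_SHL/G_OR nodes are illustrative assumptions, not code from the patch.

#include <cstdint>

// Sketch only: S_BFE_U32/S_BFE_I32 (and the 64-bit variants) take offset and
// width packed into one 32-bit source operand, per the comments in the patch.
static uint32_t packBFEOperand(uint32_t Offset, uint32_t Width) {
  uint32_t ClampedOffset = Offset & 0x3f; // bits [5:0]: offset, high bits cleared
  uint32_t ShiftedWidth = Width << 16;    // bits [22:16]: width
  return ClampedOffset | ShiftedWidth;
}

// Example: extracting 8 bits starting at bit 4 packs to 0x00080004.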