llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

We weren't fully respecting the type of a def of an immediate vs. the type at the use point. Refactor the folding logic to track the value to fold, as well as a subregister to apply to the underlying value. This is similar to how PeepholeOpt tracks subregisters (though only for pure copy-like instructions, no constants).

Fixes #139317

---

Patch is 34.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140608.diff

4 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+242-164)
- (modified) llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.gfx942.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir (+2-2)

``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 9bbc8e75fd31b..eb7fb94e25f5c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -25,52 +25,151 @@ using namespace llvm;
 
 namespace {
 
-struct FoldCandidate {
-  MachineInstr *UseMI;
+/// Track a value we may want to fold into downstream users, applying
+/// subregister extracts along the way.
+struct FoldableDef {
   union {
-    MachineOperand *OpToFold;
+    MachineOperand *OpToFold = nullptr;
     uint64_t ImmToFold;
     int FrameIndexToFold;
   };
-  int ShrinkOpcode;
-  unsigned UseOpNo;
+
+  /// Register class of the originally defined value.
+  const TargetRegisterClass *DefRC = nullptr;
+
+  /// Track the original defining instruction for the value.
+  const MachineInstr *DefMI = nullptr;
+
+  /// Subregister to apply to the value at the use point.
+  unsigned DefSubReg = AMDGPU::NoSubRegister;
+
+  /// Kind of value stored in the union.
   MachineOperand::MachineOperandType Kind;
-  bool Commuted;
 
-  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
-                bool Commuted_ = false,
-                int ShrinkOp = -1) :
-    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
-    Kind(FoldOp->getType()),
-    Commuted(Commuted_) {
-    if (FoldOp->isImm()) {
-      ImmToFold = FoldOp->getImm();
-    } else if (FoldOp->isFI()) {
-      FrameIndexToFold = FoldOp->getIndex();
+  FoldableDef() = delete;
+  FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
+              unsigned DefSubReg = AMDGPU::NoSubRegister)
+      : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
+
+    if (FoldOp.isImm()) {
+      ImmToFold = FoldOp.getImm();
+    } else if (FoldOp.isFI()) {
+      FrameIndexToFold = FoldOp.getIndex();
     } else {
-      assert(FoldOp->isReg() || FoldOp->isGlobal());
-      OpToFold = FoldOp;
+      assert(FoldOp.isReg() || FoldOp.isGlobal());
+      OpToFold = &FoldOp;
     }
+
+    DefMI = FoldOp.getParent();
   }
 
-  FoldCandidate(MachineInstr *MI, unsigned OpNo, int64_t FoldImm,
-                bool Commuted_ = false, int ShrinkOp = -1)
-      : UseMI(MI), ImmToFold(FoldImm), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
-        Kind(MachineOperand::MO_Immediate), Commuted(Commuted_) {}
+  FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
+              unsigned DefSubReg = AMDGPU::NoSubRegister)
+      : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
+        Kind(MachineOperand::MO_Immediate) {}
+
+  /// Copy the current def and apply \p SubReg to the value.
+  FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
+    FoldableDef Copy(*this);
+    Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
+    return Copy;
+  }
+
+  bool isReg() const { return Kind == MachineOperand::MO_Register; }
+
+  Register getReg() const {
+    assert(isReg());
+    return OpToFold->getReg();
+  }
+
+  unsigned getSubReg() const {
+    assert(isReg());
+    return OpToFold->getSubReg();
+  }
+
+  bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
 
   bool isFI() const {
     return Kind == MachineOperand::MO_FrameIndex;
   }
 
-  bool isImm() const {
-    return Kind == MachineOperand::MO_Immediate;
+  int getFI() const {
+    assert(isFI());
+    return FrameIndexToFold;
   }
 
-  bool isReg() const {
-    return Kind == MachineOperand::MO_Register;
+  bool isGlobal() const { return OpToFold->isGlobal(); }
+
+  /// Return the effective immediate value defined by this instruction, after
+  /// application of any subregister extracts which may exist between the use
+  /// and def instruction.
+  std::optional<int64_t> getEffectiveImmVal() const {
+    assert(isImm());
+    return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
   }
 
-  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+  /// Check if it is legal to fold this effective value into \p MI's \p OpNo
+  /// operand.
+  bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
+                      unsigned OpIdx) const {
+    switch (Kind) {
+    case MachineOperand::MO_Immediate: {
+      std::optional<int64_t> ImmToFold = getEffectiveImmVal();
+      if (!ImmToFold)
+        return false;
+
+      // TODO: Should verify the subregister index is supported by the class
+      // TODO: Avoid the temporary MachineOperand
+      MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
+      return TII.isOperandLegal(MI, OpIdx, &TmpOp);
+    }
+    case MachineOperand::MO_FrameIndex: {
+      if (DefSubReg != AMDGPU::NoSubRegister)
+        return false;
+      MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
+      return TII.isOperandLegal(MI, OpIdx, &TmpOp);
+    }
+    default:
+      // TODO: Try to apply DefSubReg, for global address we can extract
+      // low/high.
+      if (DefSubReg != AMDGPU::NoSubRegister)
+        return false;
+      return TII.isOperandLegal(MI, OpIdx, OpToFold);
+    }
+
+    llvm_unreachable("covered MachineOperand kind switch");
+  }
+};
+
+struct FoldCandidate {
+  MachineInstr *UseMI;
+  FoldableDef Def;
+  int ShrinkOpcode;
+  unsigned UseOpNo;
+  bool Commuted;
+
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
+                bool Commuted_ = false, int ShrinkOp = -1)
+      : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+        Commuted(Commuted_) {}
+
+  bool isFI() const { return Def.Kind == MachineOperand::MO_FrameIndex; }
+
+  int getFI() const {
+    assert(isFI());
+    return Def.FrameIndexToFold;
+  }
+
+  bool isImm() const { return Def.isImm(); }
+
+  bool isReg() const { return Def.isReg(); }
+
+  Register getReg() const {
+    assert(isReg());
+    return Def.OpToFold->getReg();
+  }
+
+  bool isGlobal() const { return Def.Kind == MachineOperand::MO_GlobalAddress; }
 
   bool needsShrink() const { return ShrinkOpcode != -1; }
 };
@@ -84,7 +183,7 @@ class SIFoldOperandsImpl {
   const SIMachineFunctionInfo *MFI;
 
   bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
-                         const MachineOperand &OpToFold) const;
+                         const FoldableDef &OpToFold) const;
 
   // TODO: Just use TII::getVALUOp
   unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
@@ -112,11 +211,11 @@ class SIFoldOperandsImpl {
 
   bool canUseImmWithOpSel(FoldCandidate &Fold) const;
 
-  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
+  bool tryFoldImmWithOpSel(FoldCandidate &Fold, int64_t ImmVal) const;
 
   bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                         MachineInstr *MI, unsigned OpNo,
-                        MachineOperand *OpToFold) const;
+                        const FoldableDef &OpToFold) const;
 
   bool isUseSafeToFold(const MachineInstr &MI,
                        const MachineOperand &UseMO) const;
@@ -135,12 +234,10 @@ class SIFoldOperandsImpl {
                                     MachineOperand *SplatVal,
                                     const TargetRegisterClass *SplatRC) const;
 
-  bool tryToFoldACImm(MachineOperand &OpToFold, MachineInstr *UseMI,
+  bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
                       unsigned UseOpIdx,
                       SmallVectorImpl<FoldCandidate> &FoldList) const;
 
-  void foldOperand(MachineOperand &OpToFold,
-                   MachineInstr *UseMI,
-                   int UseOpIdx,
+  void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
 
@@ -148,7 +245,7 @@ class SIFoldOperandsImpl {
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
-  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+  bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
 
   bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
@@ -240,8 +337,8 @@ static unsigned macToMad(unsigned Opc) {
 
 // TODO: Add heuristic that the frame index might not fit in the addressing mode
 // immediate offset to avoid materializing in loops.
-bool SIFoldOperandsImpl::frameIndexMayFold(
-    const MachineInstr &UseMI, int OpNo, const MachineOperand &OpToFold) const {
+bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
+                                           const FoldableDef &OpToFold) const {
   if (!OpToFold.isFI())
     return false;
 
@@ -381,7 +478,8 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(FoldCandidate &Fold) const {
   return true;
 }
 
-bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
+bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold,
+                                             int64_t ImmVal) const {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
   unsigned Opcode = MI->getOpcode();
@@ -391,8 +489,8 @@ bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
   // If the literal can be inlined as-is, apply it and short-circuit the
   // tests below. The main motivation for this is to avoid unintuitive
   // uses of opsel.
-  if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
-    Old.ChangeToImmediate(Fold.ImmToFold);
+  if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
+    Old.ChangeToImmediate(ImmVal);
     return true;
   }
 
@@ -415,10 +513,10 @@ bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
   MachineOperand &Mod = MI->getOperand(ModIdx);
   unsigned ModVal = Mod.getImm();
 
-  uint16_t ImmLo = static_cast<uint16_t>(
-      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
-  uint16_t ImmHi = static_cast<uint16_t>(
-      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
+  uint16_t ImmLo =
+      static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
+  uint16_t ImmHi =
+      static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
   uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
 
   unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
@@ -510,16 +608,20 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   assert(Old.isReg());
 
   if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
-    if (tryFoldImmWithOpSel(Fold))
+    std::optional<int64_t> ImmVal = Fold.Def.getEffectiveImmVal();
+    if (!ImmVal)
+      return false;
+
+    if (tryFoldImmWithOpSel(Fold, *ImmVal))
      return true;
 
     // We can't represent the candidate as an inline constant. Try as a literal
     // with the original opsel, checking constant bus limitations.
-    MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
+    MachineOperand New = MachineOperand::CreateImm(*ImmVal);
     int OpNo = MI->getOperandNo(&Old);
     if (!TII->isOperandLegal(*MI, OpNo, &New))
       return false;
-    Old.ChangeToImmediate(Fold.ImmToFold);
+    Old.ChangeToImmediate(*ImmVal);
     return true;
   }
 
@@ -575,22 +677,34 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
       MI->setDesc(TII->get(NewMFMAOpc));
       MI->untieRegOperand(0);
     }
-    Old.ChangeToImmediate(Fold.ImmToFold);
+
+    std::optional<int64_t> ImmVal = Fold.Def.getEffectiveImmVal();
+    if (!ImmVal)
+      return false;
+
+    // TODO: Should we try to avoid adding this to the candidate list?
+    MachineOperand New = MachineOperand::CreateImm(*ImmVal);
+    int OpNo = MI->getOperandNo(&Old);
+    if (!TII->isOperandLegal(*MI, OpNo, &New))
+      return false;
+
+    Old.ChangeToImmediate(*ImmVal);
     return true;
   }
 
   if (Fold.isGlobal()) {
-    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
-                   Fold.OpToFold->getTargetFlags());
+    Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
+                   Fold.Def.OpToFold->getOffset(),
+                   Fold.Def.OpToFold->getTargetFlags());
     return true;
   }
 
   if (Fold.isFI()) {
-    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+    Old.ChangeToFrameIndex(Fold.getFI());
     return true;
   }
 
-  MachineOperand *New = Fold.OpToFold;
+  MachineOperand *New = Fold.Def.OpToFold;
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
   if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -613,26 +727,19 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
 
 static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                 MachineInstr *MI, unsigned OpNo,
-                                MachineOperand *FoldOp, bool Commuted = false,
-                                int ShrinkOp = -1) {
-  appendFoldCandidate(FoldList,
-                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
-}
-
-static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
-                                MachineInstr *MI, unsigned OpNo, int64_t ImmVal,
+                                const FoldableDef &FoldOp,
                                 bool Commuted = false, int ShrinkOp = -1) {
   appendFoldCandidate(FoldList,
-                      FoldCandidate(MI, OpNo, ImmVal, Commuted, ShrinkOp));
+                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
 }
 
 bool SIFoldOperandsImpl::tryAddToFoldList(
     SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
-    MachineOperand *OpToFold) const {
+    const FoldableDef &OpToFold) const {
   const unsigned Opc = MI->getOpcode();
 
   auto tryToFoldAsFMAAKorMK = [&]() {
-    if (!OpToFold->isImm())
+    if (!OpToFold.isImm())
       return false;
 
     const bool TryAK = OpNo == 3;
@@ -665,8 +772,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
     return false;
   };
 
-  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
-  if (!IsLegal && OpToFold->isImm()) {
+  bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
+  if (!IsLegal && OpToFold.isImm()) {
    FoldCandidate Fold(MI, OpNo, OpToFold);
    IsLegal = canUseImmWithOpSel(Fold);
  }
@@ -700,7 +807,7 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
   }
 
   // Special case for s_setreg_b32
-  if (OpToFold->isImm()) {
+  if (OpToFold.isImm()) {
     unsigned ImmOpc = 0;
     if (Opc == AMDGPU::S_SETREG_B32)
       ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
@@ -741,10 +848,10 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
       return false;
 
     int Op32 = -1;
-    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
+    if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
-          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
+          (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }
@@ -763,7 +870,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
       Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
     }
 
-    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
+    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
+                        Op32);
     return true;
   }
 
@@ -938,7 +1046,7 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
 }
 
 bool SIFoldOperandsImpl::tryToFoldACImm(
-    MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
+    const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
     SmallVectorImpl<FoldCandidate> &FoldList) const {
   const MCInstrDesc &Desc = UseMI->getDesc();
   if (UseOpIdx >= Desc.getNumOperands())
     return false;
@@ -949,27 +1057,9 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
-  if (OpToFold.isImm()) {
-    if (unsigned UseSubReg = UseOp.getSubReg()) {
-      std::optional<int64_t> SubImm =
-          SIInstrInfo::extractSubregFromImm(OpToFold.getImm(), UseSubReg);
-      if (!SubImm)
-        return false;
-
-      // TODO: Avoid the temporary MachineOperand
-      MachineOperand TmpOp = MachineOperand::CreateImm(*SubImm);
-      if (TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp)) {
-        appendFoldCandidate(FoldList, UseMI, UseOpIdx, *SubImm);
-        return true;
-      }
-
-      return false;
-    }
-
-    if (TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
-      appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
-      return true;
-    }
+  if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
+    appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
+    return true;
   }
 
   // TODO: Verify the following code handles subregisters correctly.
@@ -985,11 +1075,17 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   // Maybe it is just a COPY of an immediate itself.
+
+  // FIXME: Remove this handling. There is already special case folding of
+  // immediate into copy in foldOperand. This is looking for the def of the
+  // value the folding started from in the first place.
   MachineInstr *Def = MRI->getVRegDef(UseReg);
   if (Def && TII->isFoldableCopy(*Def)) {
     MachineOperand &DefOp = Def->getOperand(1);
     if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
-      appendFoldCandidate(FoldList, UseMI, UseOpIdx, &DefOp);
+      FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
+                              OpToFold.DefSubReg);
+      appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
       return true;
     }
   }
@@ -998,7 +1094,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
 }
 
 void SIFoldOperandsImpl::foldOperand(
-    MachineOperand &OpToFold, MachineInstr *UseMI, int UseOpIdx,
+    FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
     SmallVectorImpl<FoldCandidate> &FoldList,
     SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
@@ -1038,18 +1134,20 @@ void SIFoldOperandsImpl::foldOperand(
       if (SplatVal) {
         if (MachineOperand *Foldable =
                 tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
-          appendFoldCandidate(FoldList, RSUseMI, OpNo, Foldable);
+          FoldableDef SplatDef(*Foldable, SplatRC);
+          appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
           continue;
         }
       }
 
+      // TODO: Handle general compose
       if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;
 
-      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
-        continue;
-
-      foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace);
+      // FIXME: We should avoid recursing here. There should be a cleaner split
+      // between the in-place mutations and adding to the fold list.
+      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
+                  CopiesToReplace);
     }
 
     return;
@@ -1077,7 +1175,7 @@ void SIFoldOperandsImpl::foldOperand(
 
   // A frame index will resolve to a positive constant, so it should always be
  // safe to fold the addressing mode, even pre-GFX9.
-  UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+  UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
 
   const unsigned Opc = UseMI->getOpcode();
   if (TII->isFLATScratch(*UseMI) &&
@@ -1107,11 +1205,12 @@ void SIFoldOperandsImpl::foldOperand(
       return;
 
     const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
-    if (!DestReg.isPhysical()) {
-      if (DestRC == &AMDGPU::AGPR_32RegClass &&
-          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+    if (!DestReg.isPhysical() && DestRC == &AMDGPU::AGPR_32RegClass) {
+      std::optional<int64_t> UseImmVal = OpToFold.getEffectiveImmVal();
+      if (UseImmVal && TII->isInlineConstant(
+ ... [truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/140608

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
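Appended illustration (not code from the patch): the point of the summary is that an immediate defined for a wide register can be consumed through a subregister, so the value that may legally be folded depends on which subregister the use applies. The minimal standalone sketch below shows that recomputation in isolation; `extractSubImm`, `Sub0`, and `Sub1` are hypothetical stand-ins for `SIInstrInfo::extractSubregFromImm` and the AMDGPU subregister indices used in the patch, and sign/zero-extension details are glossed over.

```cpp
#include <cstdint>
#include <optional>

// Hypothetical stand-ins for AMDGPU subregister indices.
enum SubRegIdx { NoSubRegister, Sub0, Sub1 };

// Return the value seen when a 64-bit immediate is read through a
// subregister. Illustrative only; the real extraction lives in
// SIInstrInfo::extractSubregFromImm and handles more indices.
std::optional<int64_t> extractSubImm(int64_t Imm, SubRegIdx Idx) {
  switch (Idx) {
  case NoSubRegister:
    return Imm; // whole 64-bit value
  case Sub0:
    return static_cast<uint32_t>(Imm); // low 32 bits
  case Sub1:
    return static_cast<uint32_t>(static_cast<uint64_t>(Imm) >> 32); // high 32 bits
  }
  return std::nullopt; // unhandled subregister: refuse to fold
}

// For Imm = 0x0000000100000002, Sub0 yields 2 and Sub1 yields 1, which is why
// FoldableDef carries DefSubReg and recomputes the effective immediate
// (getEffectiveImmVal) before asking whether the fold is legal.
```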