https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/140608
>From 4ec9c56a126192e016127e5fa5aa4416d26bcac8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Thu, 15 May 2025 10:51:39 +0200
Subject: [PATCH] AMDGPU: Fix tracking subreg defs when folding through
 reg_sequence

We weren't fully respecting the type of a def of an immediate vs. the
type at the use point. Refactor the folding logic to track the value
to fold, as well as a subregister to apply to the underlying value.
This is similar to how PeepholeOpt tracks subregisters (though only
for pure copy-like instructions, no constants).

Fixes #139317

---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     | 406 +++++++++++-------
 ...issue139317-bad-opsel-reg-sequence-fold.ll |   3 +-
 .../si-fold-operands-subreg-imm.gfx942.mir    |   8 +-
 .../AMDGPU/si-fold-operands-subreg-imm.mir    |   4 +-
 4 files changed, 250 insertions(+), 171 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 9bbc8e75fd31b..eb7fb94e25f5c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -25,52 +25,151 @@ using namespace llvm;
 
 namespace {
 
-struct FoldCandidate {
-  MachineInstr *UseMI;
+/// Track a value we may want to fold into downstream users, applying
+/// subregister extracts along the way.
+struct FoldableDef {
   union {
-    MachineOperand *OpToFold;
+    MachineOperand *OpToFold = nullptr;
     uint64_t ImmToFold;
     int FrameIndexToFold;
   };
-  int ShrinkOpcode;
-  unsigned UseOpNo;
+
+  /// Register class of the originally defined value.
+  const TargetRegisterClass *DefRC = nullptr;
+
+  /// Track the original defining instruction for the value.
+  const MachineInstr *DefMI = nullptr;
+
+  /// Subregister to apply to the value at the use point.
+  unsigned DefSubReg = AMDGPU::NoSubRegister;
+
+  /// Kind of value stored in the union.
   MachineOperand::MachineOperandType Kind;
-  bool Commuted;
 
-  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
-                bool Commuted_ = false,
-                int ShrinkOp = -1) :
-    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
-    Kind(FoldOp->getType()),
-    Commuted(Commuted_) {
-    if (FoldOp->isImm()) {
-      ImmToFold = FoldOp->getImm();
-    } else if (FoldOp->isFI()) {
-      FrameIndexToFold = FoldOp->getIndex();
+  FoldableDef() = delete;
+  FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
+              unsigned DefSubReg = AMDGPU::NoSubRegister)
+      : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
+
+    if (FoldOp.isImm()) {
+      ImmToFold = FoldOp.getImm();
+    } else if (FoldOp.isFI()) {
+      FrameIndexToFold = FoldOp.getIndex();
     } else {
-      assert(FoldOp->isReg() || FoldOp->isGlobal());
-      OpToFold = FoldOp;
+      assert(FoldOp.isReg() || FoldOp.isGlobal());
+      OpToFold = &FoldOp;
     }
+
+    DefMI = FoldOp.getParent();
   }
 
-  FoldCandidate(MachineInstr *MI, unsigned OpNo, int64_t FoldImm,
-                bool Commuted_ = false, int ShrinkOp = -1)
-      : UseMI(MI), ImmToFold(FoldImm), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
-        Kind(MachineOperand::MO_Immediate), Commuted(Commuted_) {}
+  FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
+              unsigned DefSubReg = AMDGPU::NoSubRegister)
+      : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
+        Kind(MachineOperand::MO_Immediate) {}
+
+  /// Copy the current def and apply \p SubReg to the value.
+  FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
+    FoldableDef Copy(*this);
+    Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
+    return Copy;
+  }
+
+  bool isReg() const { return Kind == MachineOperand::MO_Register; }
+
+  Register getReg() const {
+    assert(isReg());
+    return OpToFold->getReg();
+  }
+
+  unsigned getSubReg() const {
+    assert(isReg());
+    return OpToFold->getSubReg();
+  }
+
+  bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
 
   bool isFI() const { return Kind == MachineOperand::MO_FrameIndex; }
 
-  bool isImm() const {
-    return Kind == MachineOperand::MO_Immediate;
+  int getFI() const {
+    assert(isFI());
+    return FrameIndexToFold;
   }
 
-  bool isReg() const {
-    return Kind == MachineOperand::MO_Register;
+  bool isGlobal() const { return OpToFold->isGlobal(); }
+
+  /// Return the effective immediate value defined by this instruction, after
+  /// application of any subregister extracts which may exist between the use
+  /// and def instruction.
+  std::optional<int64_t> getEffectiveImmVal() const {
+    assert(isImm());
+    return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
   }
 
-  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+  /// Check if it is legal to fold this effective value into \p MI's \p OpNo
+  /// operand.
+  bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
+                      unsigned OpIdx) const {
+    switch (Kind) {
+    case MachineOperand::MO_Immediate: {
+      std::optional<int64_t> ImmToFold = getEffectiveImmVal();
+      if (!ImmToFold)
+        return false;
+
+      // TODO: Should verify the subregister index is supported by the class
+      // TODO: Avoid the temporary MachineOperand
+      MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
+      return TII.isOperandLegal(MI, OpIdx, &TmpOp);
+    }
+    case MachineOperand::MO_FrameIndex: {
+      if (DefSubReg != AMDGPU::NoSubRegister)
+        return false;
+      MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
+      return TII.isOperandLegal(MI, OpIdx, &TmpOp);
+    }
+    default:
+      // TODO: Try to apply DefSubReg, for global address we can extract
+      // low/high.
+      if (DefSubReg != AMDGPU::NoSubRegister)
+        return false;
+      return TII.isOperandLegal(MI, OpIdx, OpToFold);
+    }
+
+    llvm_unreachable("covered MachineOperand kind switch");
+  }
+};
+
+struct FoldCandidate {
+  MachineInstr *UseMI;
+  FoldableDef Def;
+  int ShrinkOpcode;
+  unsigned UseOpNo;
+  bool Commuted;
+
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
+                bool Commuted_ = false, int ShrinkOp = -1)
+      : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+        Commuted(Commuted_) {}
+
+  bool isFI() const { return Def.Kind == MachineOperand::MO_FrameIndex; }
+
+  int getFI() const {
+    assert(isFI());
+    return Def.FrameIndexToFold;
+  }
+
+  bool isImm() const { return Def.isImm(); }
+
+  bool isReg() const { return Def.isReg(); }
+
+  Register getReg() const {
+    assert(isReg());
+    return Def.OpToFold->getReg();
+  }
+
+  bool isGlobal() const { return Def.Kind == MachineOperand::MO_GlobalAddress; }
 
   bool needsShrink() const { return ShrinkOpcode != -1; }
 };
@@ -84,7 +183,7 @@ class SIFoldOperandsImpl {
   const SIMachineFunctionInfo *MFI;
 
   bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
-                         const MachineOperand &OpToFold) const;
+                         const FoldableDef &OpToFold) const;
 
   // TODO: Just use TII::getVALUOp
   unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
@@ -112,11 +211,11 @@ class SIFoldOperandsImpl {
 
   bool canUseImmWithOpSel(FoldCandidate &Fold) const;
 
-  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
+  bool tryFoldImmWithOpSel(FoldCandidate &Fold, int64_t ImmVal) const;
 
   bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                         MachineInstr *MI, unsigned OpNo,
-                        MachineOperand *OpToFold) const;
+                        const FoldableDef &OpToFold) const;
 
   bool isUseSafeToFold(const MachineInstr &MI,
                        const MachineOperand &UseMO) const;
@@ -135,12 +234,10 @@ class SIFoldOperandsImpl {
                                      MachineOperand *SplatVal,
                                      const TargetRegisterClass *SplatRC) const;
 
-  bool tryToFoldACImm(MachineOperand &OpToFold, MachineInstr *UseMI,
+  bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
                       unsigned UseOpIdx,
                       SmallVectorImpl<FoldCandidate> &FoldList) const;
 
-  void foldOperand(MachineOperand &OpToFold,
-                   MachineInstr *UseMI,
-                   int UseOpIdx,
+  void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
@@ -148,7 +245,7 @@ class SIFoldOperandsImpl {
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
-  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+  bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
 
   bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
   bool tryFoldFoldableCopy(MachineInstr &MI,
@@ -240,8 +337,8 @@ static unsigned macToMad(unsigned Opc) {
 
 // TODO: Add heuristic that the frame index might not fit in the addressing mode
 // immediate offset to avoid materializing in loops.
-bool SIFoldOperandsImpl::frameIndexMayFold(
-    const MachineInstr &UseMI, int OpNo, const MachineOperand &OpToFold) const {
+bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
+                                           const FoldableDef &OpToFold) const {
   if (!OpToFold.isFI())
     return false;
 
@@ -381,7 +478,8 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(FoldCandidate &Fold) const {
   return true;
 }
 
-bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
+bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold,
+                                             int64_t ImmVal) const {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
   unsigned Opcode = MI->getOpcode();
@@ -391,8 +489,8 @@ bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
   // If the literal can be inlined as-is, apply it and short-circuit the
   // tests below. The main motivation for this is to avoid unintuitive
   // uses of opsel.
-  if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
-    Old.ChangeToImmediate(Fold.ImmToFold);
+  if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
+    Old.ChangeToImmediate(ImmVal);
     return true;
   }
 
@@ -415,10 +513,10 @@ bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
   MachineOperand &Mod = MI->getOperand(ModIdx);
   unsigned ModVal = Mod.getImm();
 
-  uint16_t ImmLo = static_cast<uint16_t>(
-      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
-  uint16_t ImmHi = static_cast<uint16_t>(
-      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
+  uint16_t ImmLo =
+      static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
+  uint16_t ImmHi =
+      static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
   uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
 
   unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
@@ -510,16 +608,20 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   assert(Old.isReg());
 
   if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
-    if (tryFoldImmWithOpSel(Fold))
+    std::optional<int64_t> ImmVal = Fold.Def.getEffectiveImmVal();
+    if (!ImmVal)
+      return false;
+
+    if (tryFoldImmWithOpSel(Fold, *ImmVal))
      return true;
 
    // We can't represent the candidate as an inline constant. Try as a literal
    // with the original opsel, checking constant bus limitations.
-    MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
+    MachineOperand New = MachineOperand::CreateImm(*ImmVal);
     int OpNo = MI->getOperandNo(&Old);
     if (!TII->isOperandLegal(*MI, OpNo, &New))
       return false;
 
-    Old.ChangeToImmediate(Fold.ImmToFold);
+    Old.ChangeToImmediate(*ImmVal);
     return true;
   }
 
@@ -575,22 +677,34 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
       MI->setDesc(TII->get(NewMFMAOpc));
       MI->untieRegOperand(0);
     }
-    Old.ChangeToImmediate(Fold.ImmToFold);
+
+    std::optional<int64_t> ImmVal = Fold.Def.getEffectiveImmVal();
+    if (!ImmVal)
+      return false;
+
+    // TODO: Should we try to avoid adding this to the candidate list?
+    MachineOperand New = MachineOperand::CreateImm(*ImmVal);
+    int OpNo = MI->getOperandNo(&Old);
+    if (!TII->isOperandLegal(*MI, OpNo, &New))
+      return false;
+
+    Old.ChangeToImmediate(*ImmVal);
     return true;
   }
 
   if (Fold.isGlobal()) {
-    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
-                   Fold.OpToFold->getTargetFlags());
+    Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
+                   Fold.Def.OpToFold->getOffset(),
+                   Fold.Def.OpToFold->getTargetFlags());
     return true;
   }
 
   if (Fold.isFI()) {
-    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+    Old.ChangeToFrameIndex(Fold.getFI());
    return true;
  }
 
-  MachineOperand *New = Fold.OpToFold;
+  MachineOperand *New = Fold.Def.OpToFold;
   // Rework once the VS_16 register class is updated to include proper
   // 16-bit SGPRs instead of 32-bit ones.
   if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
@@ -613,26 +727,19 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
 
 static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                 MachineInstr *MI, unsigned OpNo,
-                                MachineOperand *FoldOp, bool Commuted = false,
-                                int ShrinkOp = -1) {
-  appendFoldCandidate(FoldList,
-                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
-}
-
-static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
-                                MachineInstr *MI, unsigned OpNo, int64_t ImmVal,
+                                const FoldableDef &FoldOp, bool Commuted = false,
                                 int ShrinkOp = -1) {
   appendFoldCandidate(FoldList,
-                      FoldCandidate(MI, OpNo, ImmVal, Commuted, ShrinkOp));
+                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
 }
 
 bool SIFoldOperandsImpl::tryAddToFoldList(
     SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
-    MachineOperand *OpToFold) const {
+    const FoldableDef &OpToFold) const {
   const unsigned Opc = MI->getOpcode();
 
   auto tryToFoldAsFMAAKorMK = [&]() {
-    if (!OpToFold->isImm())
+    if (!OpToFold.isImm())
       return false;
 
     const bool TryAK = OpNo == 3;
@@ -665,8 +772,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
     return false;
   };
 
-  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
-  if (!IsLegal && OpToFold->isImm()) {
+  bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
+  if (!IsLegal && OpToFold.isImm()) {
     FoldCandidate Fold(MI, OpNo, OpToFold);
     IsLegal = canUseImmWithOpSel(Fold);
   }
@@ -700,7 +807,7 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
   }
 
   // Special case for s_setreg_b32
-  if (OpToFold->isImm()) {
+  if (OpToFold.isImm()) {
     unsigned ImmOpc = 0;
     if (Opc == AMDGPU::S_SETREG_B32)
       ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
@@ -741,10 +848,10 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
       return false;
 
     int Op32 = -1;
-    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
+    if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
       if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
-          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
+          (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }
@@ -763,7 +870,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
       Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
     }
 
-    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
+    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
+                        Op32);
     return true;
   }
 
@@ -938,7 +1046,7 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
 }
 
 bool SIFoldOperandsImpl::tryToFoldACImm(
-    MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
+    const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
     SmallVectorImpl<FoldCandidate> &FoldList) const {
   const MCInstrDesc &Desc = UseMI->getDesc();
   if (UseOpIdx >= Desc.getNumOperands())
     return false;
@@ -949,27 +1057,9 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
-  if (OpToFold.isImm()) {
-    if (unsigned UseSubReg = UseOp.getSubReg()) {
-      std::optional<int64_t> SubImm =
-          SIInstrInfo::extractSubregFromImm(OpToFold.getImm(), UseSubReg);
-      if (!SubImm)
-        return false;
-
-      // TODO: Avoid the temporary MachineOperand
-      MachineOperand TmpOp = MachineOperand::CreateImm(*SubImm);
-      if (TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp)) {
-        appendFoldCandidate(FoldList, UseMI, UseOpIdx, *SubImm);
-        return true;
-      }
-
-      return false;
-    }
-
-    if (TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
-      appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
-      return true;
-    }
+  if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
+    appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
+    return true;
   }
 
   // TODO: Verify the following code handles subregisters correctly.
@@ -985,11 +1075,17 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   // Maybe it is just a COPY of an immediate itself.
+
+  // FIXME: Remove this handling. There is already special case folding of
+  // immediate into copy in foldOperand. This is looking for the def of the
+  // value the folding started from in the first place.
   MachineInstr *Def = MRI->getVRegDef(UseReg);
   if (Def && TII->isFoldableCopy(*Def)) {
     MachineOperand &DefOp = Def->getOperand(1);
     if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
-      appendFoldCandidate(FoldList, UseMI, UseOpIdx, &DefOp);
+      FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
+                              OpToFold.DefSubReg);
+      appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
       return true;
     }
   }
@@ -998,7 +1094,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
   return false;
 }
 
 void SIFoldOperandsImpl::foldOperand(
-    MachineOperand &OpToFold, MachineInstr *UseMI, int UseOpIdx,
+    FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
     SmallVectorImpl<FoldCandidate> &FoldList,
     SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
@@ -1038,18 +1134,20 @@ void SIFoldOperandsImpl::foldOperand(
       if (SplatVal) {
         if (MachineOperand *Foldable =
                 tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
-          appendFoldCandidate(FoldList, RSUseMI, OpNo, Foldable);
+          FoldableDef SplatDef(*Foldable, SplatRC);
+          appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
           continue;
         }
       }
 
+      // TODO: Handle general compose
       if (RSUse->getSubReg() != RegSeqDstSubReg)
         continue;
 
-      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
-        continue;
-
-      foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace);
+      // FIXME: We should avoid recursing here. There should be a cleaner split
+      // between the in-place mutations and adding to the fold list.
+      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
+                  CopiesToReplace);
     }
 
     return;
@@ -1077,7 +1175,7 @@ void SIFoldOperandsImpl::foldOperand(
 
   // A frame index will resolve to a positive constant, so it should always be
   // safe to fold the addressing mode, even pre-GFX9.
-    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
 
     const unsigned Opc = UseMI->getOpcode();
     if (TII->isFLATScratch(*UseMI) &&
@@ -1107,11 +1205,12 @@ void SIFoldOperandsImpl::foldOperand(
       return;
 
     const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
-    if (!DestReg.isPhysical()) {
-      if (DestRC == &AMDGPU::AGPR_32RegClass &&
-          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+    if (!DestReg.isPhysical() && DestRC == &AMDGPU::AGPR_32RegClass) {
+      std::optional<int64_t> UseImmVal = OpToFold.getEffectiveImmVal();
+      if (UseImmVal && TII->isInlineConstant(
+                           *UseImmVal, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
-        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+        UseMI->getOperand(1).ChangeToImmediate(*UseImmVal);
        CopiesToReplace.push_back(UseMI);
        return;
      }
@@ -1167,8 +1266,9 @@ void SIFoldOperandsImpl::foldOperand(
   if (UseMI->isCopy() && OpToFold.isReg() &&
       UseMI->getOperand(0).getReg().isVirtual() &&
       !UseMI->getOperand(1).getSubReg() &&
-      OpToFold.getParent()->implicit_operands().empty()) {
-    LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
+      OpToFold.DefMI->implicit_operands().empty()) {
+    LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
+                      << *UseMI);
     unsigned Size = TII->getOpSize(*UseMI, 1);
     Register UseReg = OpToFold.getReg();
     UseMI->getOperand(1).setReg(UseReg);
@@ -1208,7 +1308,7 @@ void SIFoldOperandsImpl::foldOperand(
     UseMI->getOperand(1).setSubReg(SubRegIdx);
     UseMI->getOperand(1).setIsKill(false);
     CopiesToReplace.push_back(UseMI);
-    OpToFold.setIsKill(false);
+    OpToFold.OpToFold->setIsKill(false);
 
     // Remove kill flags as kills may now be out of order with uses.
     MRI->clearKillFlags(UseReg);
@@ -1228,21 +1328,21 @@ void SIFoldOperandsImpl::foldOperand(
     if (FoldingImmLike) {
       if (execMayBeModifiedBeforeUse(*MRI,
                                      UseMI->getOperand(UseOpIdx).getReg(),
-                                     *OpToFold.getParent(),
-                                     *UseMI))
+                                     *OpToFold.DefMI, *UseMI))
         return;
 
       UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
 
-      if (OpToFold.isImm())
-        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
-      else if (OpToFold.isFI())
-        UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
+      if (OpToFold.isImm()) {
+        UseMI->getOperand(1).ChangeToImmediate(
+            *OpToFold.getEffectiveImmVal());
+      } else if (OpToFold.isFI())
+        UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
       else {
         assert(OpToFold.isGlobal());
-        UseMI->getOperand(1).ChangeToGA(OpToFold.getGlobal(),
-                                        OpToFold.getOffset(),
-                                        OpToFold.getTargetFlags());
+        UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
+                                        OpToFold.OpToFold->getOffset(),
+                                        OpToFold.OpToFold->getTargetFlags());
       }
       UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
       return;
@@ -1251,8 +1351,7 @@ void SIFoldOperandsImpl::foldOperand(
     if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
       if (execMayBeModifiedBeforeUse(*MRI,
                                      UseMI->getOperand(UseOpIdx).getReg(),
-                                     *OpToFold.getParent(),
-                                     *UseMI))
+                                     *OpToFold.DefMI, *UseMI))
         return;
 
       // %vgpr = COPY %sgpr0
@@ -1294,7 +1393,7 @@ void SIFoldOperandsImpl::foldOperand(
      return;
    }
 
-    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
+    tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
 
    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
@@ -1302,32 +1401,7 @@ void SIFoldOperandsImpl::foldOperand(
     return;
   }
-
-  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
-  const TargetRegisterClass *FoldRC =
-      TRI->getRegClass(FoldDesc.operands()[0].RegClass);
-
-  // Split 64-bit constants into 32-bits for folding.
-  if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
-    Register UseReg = UseOp->getReg();
-    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
-    if (AMDGPU::getRegBitWidth(*UseRC) != 64)
-      return;
-
-    APInt Imm(64, OpToFold.getImm());
-    if (UseOp->getSubReg() == AMDGPU::sub0) {
-      Imm = Imm.getLoBits(32);
-    } else {
-      assert(UseOp->getSubReg() == AMDGPU::sub1);
-      Imm = Imm.getHiBits(32);
-    }
-
-    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
-    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
-    return;
-  }
-
-  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
+  tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
 }
 
 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
@@ -1606,7 +1680,7 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
 }
 
 bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
-                                         MachineOperand &OpToFold) const {
+                                         const FoldableDef &OpToFold) const {
   // We need mutate the operands of new mov instructions to add implicit
   // uses of EXEC, but adding them invalidates the use_iterator, so defer
   // this.
@@ -1637,7 +1711,9 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
       llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
   for (auto *U : UsesToProcess) {
     MachineInstr *UseMI = U->getParent();
-    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
+
+    FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
+    foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                 CopiesToReplace);
   }
 
@@ -1650,10 +1726,10 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
     Copy->addImplicitDefUseOperands(*MF);
 
   for (FoldCandidate &Fold : FoldList) {
-    assert(!Fold.isReg() || Fold.OpToFold);
-    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
-      Register Reg = Fold.OpToFold->getReg();
-      MachineInstr *DefMI = Fold.OpToFold->getParent();
+    assert(!Fold.isReg() || Fold.Def.OpToFold);
+    if (Fold.isReg() && Fold.getReg().isVirtual()) {
+      Register Reg = Fold.getReg();
+      const MachineInstr *DefMI = Fold.Def.DefMI;
       if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
           execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
         continue;
@@ -1661,11 +1737,11 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
     if (updateOperand(Fold)) {
       // Clear kill flags.
       if (Fold.isReg()) {
-        assert(Fold.OpToFold && Fold.OpToFold->isReg());
+        assert(Fold.Def.OpToFold && Fold.isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
-        MRI->clearKillFlags(Fold.OpToFold->getReg());
+        MRI->clearKillFlags(Fold.getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
@@ -1847,6 +1923,18 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
     return false;
 
+  // Prevent folding operands backwards in the function. For example,
+  // the COPY opcode must not be replaced by 1 in this example:
+  //
+  // %3 = COPY %vgpr0; VGPR_32:%3
+  //  ...
+  // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+  if (!DstReg.isVirtual())
+    return false;
+
+  const TargetRegisterClass *DstRC =
+      MRI->getRegClass(MI.getOperand(0).getReg());
+
   // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
   // Can remove this code if proper 16-bit SGPRs are implemented
   // Example: Pre-peephole-opt
@@ -1863,8 +1951,6 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
   if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
       OpToFold.getSubReg()) {
-    const TargetRegisterClass *DstRC =
-        MRI->getRegClass(MI.getOperand(0).getReg());
     if (DstRC == &AMDGPU::SReg_32RegClass &&
         DstRC == MRI->getRegClass(OpToFold.getReg())) {
       assert(OpToFold.getSubReg() == AMDGPU::lo16);
@@ -1872,15 +1958,6 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     }
   }
 
-  // Prevent folding operands backwards in the function. For example,
-  // the COPY opcode must not be replaced by 1 in this example:
-  //
-  // %3 = COPY %vgpr0; VGPR_32:%3
-  //  ...
-  // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
-  if (!DstReg.isVirtual())
-    return false;
-
   if (OpToFold.isReg() &&
       foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
     return true;
@@ -1892,7 +1969,8 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     return true;
   }
 
-  bool Changed = foldInstOperand(MI, OpToFold);
+  FoldableDef Def(OpToFold, DstRC);
+  bool Changed = foldInstOperand(MI, Def);
 
   // If we managed to fold all uses of this copy then we might as well
   // delete it now.
diff --git a/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll b/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll
index 7d1ea68e63241..f18a657b8082d 100644
--- a/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll
@@ -16,6 +16,7 @@ define amdgpu_kernel void @stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b9
 ; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX942-NEXT:    s_mov_b32 s8, 0x47004600
 ; GFX942-NEXT:    s_mov_b32 s9, 0x45004400
+; GFX942-NEXT:    s_mov_b32 s10, 0x42004000
 ; GFX942-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX942-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
@@ -29,7 +30,7 @@ define amdgpu_kernel void @stepper_test_kernel_DType_I6A6AcB6A6AsA6A6A_68a5362b9
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_pk_add_f16 v7, v7, s8
 ; GFX942-NEXT:    v_pk_add_f16 v6, v6, s9
-; GFX942-NEXT:    v_pk_add_f16 v5, v5, 0
+; GFX942-NEXT:    v_pk_add_f16 v5, v5, s10
 ; GFX942-NEXT:    v_pk_add_f16 v4, v4, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
 ; GFX942-NEXT:    global_store_dwordx4 v2, v[4:7], s[0:1]
 ; GFX942-NEXT:    s_add_u32 s0, s0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.gfx942.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.gfx942.mir
index c1ce216955dca..98b179fb5f2a8 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.gfx942.mir
@@ -18,7 +18,7 @@ body: |
     ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1107312640
     ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1006632960
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
-    ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 8, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 8, [[REG_SEQUENCE]].sub1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F16_]]
    %0:vgpr_32 = COPY $vgpr0
    %1:sreg_32 = S_MOV_B32 1107312640
@@ -99,7 +99,7 @@ body: |
    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1006632960
    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
-    ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 8, 4294967295, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 8, -16, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F16_]]
    %0:vgpr_32 = COPY $vgpr0
    %1:sreg_32 = S_MOV_B32 -16
@@ -123,7 +123,7 @@ body: |
    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 15360
    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1006632960
    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
-    ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 8, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 8, 15360, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F16_]]
    %0:vgpr_32 = COPY $vgpr0
    %1:sreg_32 = S_MOV_B32 15360
@@ -147,7 +147,7 @@ body: |
    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1006632960
    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 15360
    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
-    ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 8, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[COPY]], 4, 15360, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F16_]]
    %0:vgpr_32 = COPY $vgpr0
    %1:sreg_32 = S_MOV_B32 1006632960
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
index 38b4533a14895..1742349673d7c 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
@@ -115,8 +115,8 @@ body: |
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
    ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 8
    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B64_]].sub1, %subreg.sub0, [[S_MOV_B64_]].sub0, %subreg.sub1
-    ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]].sub0, 8, implicit-def $scc
-    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 0, implicit-def $scc, implicit $scc
+    ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]].sub0, 0, implicit-def $scc
+    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 8, implicit-def $scc, implicit $scc
    ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_ADD_U32_]], implicit [[S_ADDC_U32_]]
    %0:sgpr_64 = COPY $sgpr8_sgpr9
    %1:sreg_64 = S_MOV_B64 8

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
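
For readers following the FoldableDef change above: the core of the fix is that an
immediate def is now re-evaluated through the subregister index recorded at the use
point (getEffectiveImmVal, which defers to SIInstrInfo::extractSubregFromImm). The
standalone sketch below illustrates those semantics under simplified assumptions; the
enum and helper names here are stand-ins for illustration, not the in-tree
definitions, and the real helper's exact sign-extension behavior may differ.

// Illustrative sketch only (not the in-tree implementation): how a
// subregister index selects a slice of a 64-bit immediate when a fold
// crosses a REG_SEQUENCE.
#include <cstdint>
#include <optional>

// Simplified stand-ins for the AMDGPU subregister indices.
enum SubRegIdx { NoSubRegister, Sub0, Sub1, Lo16, Hi16 };

std::optional<int64_t> extractSubregFromImmSketch(int64_t Imm, SubRegIdx Idx) {
  switch (Idx) {
  case NoSubRegister: // Use the full value unchanged.
    return Imm;
  case Sub0:          // Low 32 bits (sign-extended back to 64 bits here).
    return static_cast<int32_t>(Imm);
  case Sub1:          // High 32 bits.
    return static_cast<int32_t>(Imm >> 32);
  case Lo16:          // Bits [15:0].
    return static_cast<int16_t>(Imm);
  case Hi16:          // Bits [31:16].
    return static_cast<int16_t>(Imm >> 16);
  }
  return std::nullopt; // Unknown index: refuse the fold.
}

// Example: for Imm = 0x4500440042004000, Sub0 yields 0x42004000 and Sub1
// yields 0x45004400, i.e. the per-half constants the updated GFX942 test
// expects instead of a bogus literal 0.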