https://github.com/hidekisaito updated https://github.com/llvm/llvm-project/pull/190723
>From 61f04b9c88eb92e00cc57f92dcbbeb4a292d4074 Mon Sep 17 00:00:00 2001 From: Hideki Saito <[email protected]> Date: Mon, 6 Apr 2026 16:11:39 -0500 Subject: [PATCH] iglp_opt(4): Unpack V_PK F32 and then evenly space MFMAs and VALUs --- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 6 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 +- llvm/lib/IR/Verifier.cpp | 10 + llvm/lib/Target/AMDGPU/AMDGPU.h | 3 + llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp | 1205 +++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 93 +- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 21 + llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 12 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 26 + .../iglp-opt4-mfma-valu-spacing-scheduler.ll | 72 + .../iglp-opt4-mfma-valu-spacing-scheduler.mir | 35 + .../AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir | 40 + .../CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll | 59 + llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 2 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 + .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 + 18 files changed, 1583 insertions(+), 18 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll create mode 100644 llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir create mode 100644 llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir create mode 100644 llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 0fc40d396a87d..cd4cb103291dd 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -980,14 +980,16 @@ void test_sched_group_barrier() // CHECK-LABEL: @test_iglp_opt // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 0) // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 1) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 2) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 3) // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 4) -// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 15) void test_iglp_opt() { __builtin_amdgcn_iglp_opt(0); __builtin_amdgcn_iglp_opt(1); + __builtin_amdgcn_iglp_opt(2); + __builtin_amdgcn_iglp_opt(3); __builtin_amdgcn_iglp_opt(4); - __builtin_amdgcn_iglp_opt(15); } // CHECK-LABEL: @test_s_sleep diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f576972183eca..ebe48704bca4f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -377,8 +377,14 @@ def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -// Scheduler optimization hint. -// MASK = 0: Small gemm opt +// Scheduler optimization hint: immediate selects AMDGPU::IGLPStrategyID (see +// AMDGPUIGroupLP.h). Must be a constant in [0,4]. Mappings: +// 0 - MFMA small-GEMM scheduling (MFMASmallGemmOpt). +// 1 - MFMA small-GEMM single-wave variant (MFMASmallGemmSingleWaveOpt). +// 2 - MFMAExpInterleaveOpt (TRANS/MFMA scheduling pipeline; see AMDGPUIGroupLP.cpp). +// 3 - MFMAExpSimpleInterleaveOpt (simpler TRANS-then-MFMA interleave pattern). 
+// 4 - MFMA (or WMMA) / VALU spacing (MFMAValuSpacingOpt); also gates +// AMDGPUIGLPUnpack V_PK unpack when that pass is enabled. def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 7d5dc91a1504e..6434298fe9062 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -7113,6 +7113,16 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "llvm.amdgcn.s.prefetch.data only supports global or constant memory"); break; } + case Intrinsic::amdgcn_iglp_opt: { + const auto *Mask = dyn_cast<ConstantInt>(Call.getArgOperand(0)); + Check(Mask, "llvm.amdgcn.iglp.opt requires a constant mask argument", &Call, + Call.getArgOperand(0)); + const int64_t V = Mask->getSExtValue(); + Check(V >= 0 && V <= 4, + "llvm.amdgcn.iglp.opt mask must be in the range [0,4]", &Call, + Call.getArgOperand(0)); + break; + } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { Value *Src0 = Call.getArgOperand(0); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 878f374110159..acbc19b43d32d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -559,6 +559,9 @@ extern char &GCNNSAReassignID; void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &); extern char &GCNPreRALongBranchRegID; +void initializeAMDGPUIGLPUnpackLegacyPass(PassRegistry &); +extern char &AMDGPUIGLPUnpackID; + void initializeGCNPreRAOptimizationsLegacyPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp new file mode 100644 index 0000000000000..7b370a3a7aaa9 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp @@ -0,0 +1,1205 @@ +//===-- AMDGPUIGLPUnpack.cpp - AMDGPU IGLP unpack MIR cleanup -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Machine IR cleanup pass that runs after rename-independent-subregs and +// before the pre-RA machine scheduler. +// +// Schedule regions (for IGLP / packed-op analysis) are delimited by machine +// basic block boundaries and by scheduling boundaries as defined by +// SIInstrInfo::isSchedulingBoundary (e.g. SCHED_BARRIER with mask 0). +// +// V_PK * F32 unpacking lowers each packed op to two scalar VALUs that define +// the destination register by subregs (e.g. vreg_64: undef %dst.sub0 = …; +// %dst.sub1 = …), matching how wide accumulators are updated in real kernels +// (full wide def, then per-lane subreg writes — see incoming FMHA MIR). A +// single V_PK full-reg def leaves IsSSA set on the function; multiple subreg +// defs require clearing IsSSA. Subreg indices follow the V_PK dest (sub0/sub1 +// for vreg_64; composite 64-bit lanes e.g. sub0_sub1 … sub6_sub7 on wide +// vectors via composeSubRegIndices). Unpacked sources use the same compose +// pattern as SIPreEmitPeephole (operand packed subreg composed with sub0/sub1 +// for the scalar lane). 
Undef on the first unpacked def only when the +// destination vreg has no prior def in the block and the destination is not +// also a packed source (otherwise lanes already hold defined values). Emit low +// lane, then high lane (first BuildMI before V_PK is earliest in the block). +// +// COPY cleanup: when a packed source vreg is only used by this V_PK and is +// populated solely by COPYs from another register (e.g. assembling a vreg_64 +// from two lanes of a wide vector), fold those COPY sources into the unpacked +// VALUs and erase the now-dead COPY defs. +// +// Same-vreg subreg COPY (e.g. %v.sub1 = COPY %v.sub0): when every read of the +// destination lane is an explicit DstSub use after the COPY, rewrite those to +// SrcSub and remove the COPY. Full-register uses cannot be folded. +// +// Sink: when the COPY cannot be removed, move it down in the same MBB to +// immediately before the first instruction that reads the destination lane +// (or immediately before the first def that blocks sinking past it). +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUIGroupLP.h" +#include "GCNSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/IR/Analysis.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include <algorithm> +#include <iterator> +#include <limits> + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-iglp-unpack" + +/// Max number of V_PK unpack attempts per schedule sub-region (program order +/// within the region). 0 means unlimited. Resets for each sub-region. Does not +/// affect --amdgpu-enable-iglp-unpack=false (full bypass). +static cl::opt<unsigned> AMDGPUIGLPUnpackMaxVPKPerRegion( + "amdgpu-iglp-unpack-max-vpk-per-region", + cl::desc("Unpack at most this many V_PK instructions per schedule region " + "(0 = unlimited)"), + cl::init(0), cl::Hidden); + +/// After V_PK unpacking, optionally split 64-bit VGPR/AGPR vregs that are only +/// referenced via \p sub0 / \p sub1 into 32-bit vregs. 0 skips this cleanup. +/// Default is UINT_MAX (all eligible vregs). Unlike the V_PK-per-region cap, +/// 0 does not mean unlimited. +static cl::opt<unsigned> AMDGPUIGLPUnpackPostCleanupMaxVRegs( + "amdgpu-iglp-unpack-post-cleanup-max-vregs", + cl::desc("Post-unpack: split at most this many 64-bit VGPR/AGPR registers " + "to 32-bit (0 = skip; default = all)"), + cl::init(std::numeric_limits<unsigned>::max()), cl::Hidden); + +// === AMDGPUIGLP_UNPACK_POLICY ============================================== +// +// kRequireMFMAValuSpacingIGLPOptForRegion +// When true (default): only schedule sub-regions that contain IGLP_OPT with +// immediate == MFMAValuSpacingOptID are candidates for V_PK unpack. That +// pseudo must survive for the pre-RA scheduler (IGroupLP) to apply +// MFMAValuSpacingOpt. +// When false: any sub-region with at least one V_PK is a candidate; IGLP_OPT +// is not required. 
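+//
+// For reference, a minimal candidate sub-region under the default policy looks
+// roughly like this sketch (operands abbreviated, vreg names hypothetical):
+//   SCHED_BARRIER 0                      ; scheduling boundary
+//   IGLP_OPT 4                           ; MFMAValuSpacingOptID
+//   %d:vreg_64 = V_PK_FMA_F32 ...        ; unpack candidate
+//   ...                                  ; MFMA / VALU body
+//   SCHED_BARRIER 0                      ; scheduling boundary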
+// +static constexpr bool kRequireMFMAValuSpacingIGLPOptForRegion = true; +// ============================================================================ + +namespace { + +/// A schedule sub-region (between scheduling boundaries) that has at least one +/// V_PK, and when kRequireMFMAValuSpacingIGLPOptForRegion also an IGLP_OPT with +/// immediate MFMAValuSpacingOptID, for unpack / cleanup. +struct CandidateRegion { + MachineBasicBlock *MBB = nullptr; + MachineBasicBlock::iterator Begin; + MachineBasicBlock::iterator End; + SmallVector<MachineInstr *, 8> VPKInsts; +}; + +static bool isVPKOpcode(const MCInstrInfo &II, unsigned Opc) { + return II.getName(Opc).starts_with("V_PK"); +} + +/// Log why a single V_PK was not unpacked. Enable with +/// -debug-only=amdgpu-iglp-unpack. +static void debugSkipVPKUnpack(const MachineInstr &MI, const SIInstrInfo *TII, + const TargetRegisterInfo *TRI, + const char *Reason, unsigned LoSeqIdx = 0, + unsigned HiSeqIdx = 0, bool LoRCValid = true, + bool HiRCValid = true) { + LLVM_DEBUG({ + dbgs() << DEBUG_TYPE << ": skip unpack (" << Reason << ")"; + if (MI.getNumOperands() > 0 && MI.getOperand(0).isReg()) { + dbgs() << " op=" << TII->getName(MI.getOpcode()) << " dst=" + << printReg(MI.getOperand(0).getReg(), TRI, + MI.getOperand(0).getSubReg()); + } + if (LoSeqIdx && TRI) { + if (const char *N = TRI->getSubRegIndexName(LoSeqIdx)) + dbgs() << " loSub=" << N; + else + dbgs() << " loSubIdx=" << LoSeqIdx; + if (!LoRCValid) + dbgs() << "[!validRC]"; + } + if (HiSeqIdx && TRI) { + if (const char *N = TRI->getSubRegIndexName(HiSeqIdx)) + dbgs() << " hiSub=" << N; + else + dbgs() << " hiSubIdx=" << HiSeqIdx; + if (!HiRCValid) + dbgs() << "[!validRC]"; + } + dbgs() << "\n"; + }); +} + +static void splitMBBBySchedBarriers( + MachineBasicBlock &MBB, const SIInstrInfo &TII, + SmallVectorImpl<std::pair<MachineBasicBlock::iterator, + MachineBasicBlock::iterator>> &Regions) { + Regions.clear(); + MachineFunction &MF = *MBB.getParent(); + MachineBasicBlock::iterator Start = MBB.begin(); + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; + ++I) { + if (TII.isSchedulingBoundary(*I, &MBB, MF)) { + Regions.push_back({Start, I}); + Start = std::next(I); + } + } + Regions.push_back({Start, MBB.end()}); +} + +static bool findCandidateRegion(MachineBasicBlock &MBB, + MachineBasicBlock::iterator RegionBegin, + MachineBasicBlock::iterator RegionEnd, + const MCInstrInfo &II, CandidateRegion &Out) { + Out = CandidateRegion{}; + bool HasMFMAValuSpacingIGLP = false; + + for (MachineBasicBlock::iterator It = RegionBegin; It != RegionEnd; ++It) { + MachineInstr &MI = *It; + if (MI.getOpcode() == AMDGPU::IGLP_OPT && MI.getNumOperands() >= 1 && + MI.getOperand(0).isImm() && + MI.getOperand(0).getImm() == + static_cast<int64_t>(AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID)) + HasMFMAValuSpacingIGLP = true; + + if (isVPKOpcode(II, MI.getOpcode())) + Out.VPKInsts.push_back(&MI); + } + + if (Out.VPKInsts.empty()) + return false; + if (kRequireMFMAValuSpacingIGLPOptForRegion && !HasMFMAValuSpacingIGLP) + return false; + + Out.MBB = &MBB; + Out.Begin = RegionBegin; + Out.End = RegionEnd; + return true; +} + +// --- F32 unpack (aligned with SIPreEmitPeephole) ---------------------------- + +/// Skip unpacking for a schedule sub-region if any instruction has an explicit +/// operand using an allocatable physical register (pre-RA MIR is virtual; phys +/// operands indicate an unusual/late state we do not transform). 
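+/// (E.g. an INLINEASM operand constrained to a specific VGPR counts as such an
+/// operand and causes the whole sub-region to be skipped.)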
+static bool +schedRegionHasExplicitAllocatablePhysReg(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + const MachineRegisterInfo &MRI) { + for (MachineBasicBlock::iterator It = Begin; It != End; ++It) { + for (const MachineOperand &MO : It->operands()) { + if (!MO.isReg() || !MO.getReg().isPhysical()) + continue; + if (MO.isImplicit()) + continue; + if (!MRI.isAllocatable(MO.getReg())) + continue; + return true; + } + } + return false; +} + +static uint32_t mapToUnpackedOpcode(const MachineInstr &I) { + switch (I.getOpcode()) { + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_F32_gfx12: + return AMDGPU::V_ADD_F32_e64; + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_F32_gfx12: + return AMDGPU::V_MUL_F32_e64; + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMA_F32_gfx12: + return AMDGPU::V_FMA_F32_e64; + default: + return std::numeric_limits<uint32_t>::max(); + } +} + +static bool canUnpackingClobberRegister(const MachineInstr &MI, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI) { + Register DstReg = MI.getOperand(0).getReg(); + // Virtual unpack lowers to subreg VALU defs; the classic post-RA clobber case + // only applies to physical destinations. + if (DstReg.isVirtual()) + return false; + + unsigned OpCode = MI.getOpcode(); + Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); + + const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + if (Src0MO && Src0MO->isReg()) { + Register SrcReg0 = Src0MO->getReg(); + unsigned Src0Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); + Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) + : TRI->getSubReg(SrcReg0, AMDGPU::sub0); + if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) + return true; + } + + const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1MO && Src1MO->isReg()) { + Register SrcReg1 = Src1MO->getReg(); + unsigned Src1Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm(); + Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) + : TRI->getSubReg(SrcReg1, AMDGPU::sub0); + if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) + return true; + } + + if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { + const MachineOperand *Src2MO = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2MO && Src2MO->isReg()) { + Register SrcReg2 = Src2MO->getReg(); + unsigned Src2Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); + Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) + : TRI->getSubReg(SrcReg2, AMDGPU::sub0); + if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) + return true; + } + } + return false; +} + +/// True if any packed source uses the same vreg as the destination +/// (dst-as-src). 
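+/// E.g. an in-place accumulate %acc = V_PK_ADD_F32 ..., %acc, ... (modifiers
+/// elided): the destination lanes already hold defined values, so the first
+/// unpacked def must not be marked undef.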
+static bool vpkAnySrcUsesDst(const MachineInstr &MI, const SIInstrInfo *TII, + Register DstReg) { + unsigned Opc = MI.getOpcode(); + const MachineOperand *S0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + if (S0 && S0->isReg() && S0->getReg() == DstReg) + return true; + const MachineOperand *S1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (S1 && S1->isReg() && S1->getReg() == DstReg) + return true; + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src2)) { + const MachineOperand *S2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (S2 && S2->isReg() && S2->getReg() == DstReg) + return true; + } + return false; +} + +/// True if Reg has an explicit def in the same MBB strictly before I. +static bool hasExplicitDefOfRegBefore(const MachineInstr &I, Register Reg) { + const MachineBasicBlock *MBB = I.getParent(); + for (MachineBasicBlock::const_iterator It = MBB->begin(), E = I.getIterator(); + It != E; ++It) { + for (const MachineOperand &MO : It->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == Reg) + return true; + } + } + return false; +} + +/// True if every non-debug use of R is on \p VPK (same instruction may use R +/// more than once, e.g. src0 and src1). +static bool isVirtualSrcRegOnlyUsedByThisVPK(Register R, MachineInstr &VPK, + MachineRegisterInfo &MRI) { + if (!R.isVirtual()) + return false; + if (MRI.use_nodbg_empty(R)) + return false; + for (MachineInstr &U : MRI.use_nodbg_instructions(R)) { + if (&U != &VPK) + return false; + } + return true; +} + +/// If a COPY in the same MBB strictly before VPK defines SrcReg with subreg +/// NeedSubIdx from a register source, return true and set OutReg/OutSub. +static bool tryFoldThroughSameBlockCopyDef(const MachineInstr &VPK, + Register SrcReg, unsigned NeedSubIdx, + Register &OutReg, unsigned &OutSub) { + const MachineBasicBlock *MBB = VPK.getParent(); + for (MachineBasicBlock::const_iterator It = MBB->begin(), + E = VPK.getIterator(); + It != E; ++It) { + if (It->getOpcode() != AMDGPU::COPY) + continue; + const MachineOperand &DefMO = It->getOperand(0); + if (!DefMO.isReg() || !DefMO.isDef() || DefMO.getReg() != SrcReg) + continue; + if (DefMO.getSubReg() != NeedSubIdx) + continue; + const MachineOperand &SrcMO = It->getOperand(1); + if (!SrcMO.isReg()) + return false; + OutReg = SrcMO.getReg(); + OutSub = SrcMO.getSubReg(); + return true; + } + return false; +} + +static void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods, + bool IsHiBits, const MachineOperand &SrcMO, + Register DstReg, Register LaneSrcBase, + unsigned DstPackSub, MachineInstr &VPK, + const SIRegisterInfo *TRI) { + unsigned NewSrcMods = 0; + unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; + unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; + if (SrcMods & NegModifier) + NewSrcMods |= SISrcMods::NEG; + NewMI.addImm(NewSrcMods); + if (SrcMO.isImm()) { + NewMI.addImm(SrcMO.getImm()); + return; + } + Register OrigSrcReg = SrcMO.getReg(); + unsigned SrcPackSub = SrcMO.getSubReg(); + + const bool UseHiOfPair = (SrcMods & OpSelModifier) != 0; + const unsigned PairLane = UseHiOfPair ? AMDGPU::sub1 : AMDGPU::sub0; + + // Packed subreg on the operand (e.g. sub6_sub7); when src is the same + // super-register as dst (dst-as-src), MO may omit the subreg — use the V_PK + // destination packed subreg. + unsigned BasePack = SrcPackSub; + if (!BasePack && OrigSrcReg == DstReg) + BasePack = DstPackSub; + + unsigned FinalSubIdx = + BasePack ? 
TRI->composeSubRegIndices(BasePack, PairLane) : PairLane; + + Register SrcReg = OrigSrcReg; + if (OrigSrcReg.isVirtual() && OrigSrcReg != DstReg) { + Register FoldReg; + unsigned FoldSub = 0; + if (tryFoldThroughSameBlockCopyDef(VPK, OrigSrcReg, FinalSubIdx, FoldReg, + FoldSub) && + FoldSub) { + // COPY source subreg is the exact read for this packed lane; use it + // directly for scalar lanes (<=32b). Wider composed subregs need + // compose(FoldSub, PairLane). + SrcReg = FoldReg; + const unsigned Sz = TRI->getSubRegIdxSize(FoldSub); + if (Sz <= 32) + FinalSubIdx = FoldSub; + else if (unsigned Composed = TRI->composeSubRegIndices(FoldSub, PairLane)) + FinalSubIdx = Composed; + else + FinalSubIdx = FoldSub; + } + } + + if (OrigSrcReg == DstReg) + SrcReg = LaneSrcBase; + + bool KillState = false; + if (SrcMO.isKill()) { + bool OpSel = SrcMods & SISrcMods::OP_SEL_0; + bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1; + KillState = true; + if ((OpSel == OpSelHi) && !IsHiBits) + KillState = false; + } + if (SrcReg.isPhysical()) { + Register Phys = TRI->getSubReg(SrcReg, FinalSubIdx); + if (KillState) + NewMI.addReg(Phys, RegState::Kill); + else + NewMI.addReg(Phys); + } else { + if (KillState) + NewMI.addReg(SrcReg, RegState::Kill, FinalSubIdx); + else + NewMI.addReg(SrcReg, {}, FinalSubIdx); + } +} + +static MachineInstrBuilder +createUnpackedMI(MachineInstr &I, const SIInstrInfo *TII, + const SIRegisterInfo *TRI, uint32_t UnpackedOpcode, + bool IsHiBits, Register DstReg, unsigned DefSubIdx, + Register LaneSrcBase, unsigned DstPackSub, bool UndefOnDef) { + MachineBasicBlock &MBB = *I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0); + const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1); + unsigned OpCode = I.getOpcode(); + + int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm(); + unsigned Src0Mods = + TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm(); + unsigned Src1Mods = + TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm(); + + MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); + NewMI.addDef(DstReg, RegState::Define, DefSubIdx); + if (UndefOnDef) + NewMI->getOperand(0).setIsUndef(true); + addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0, DstReg, LaneSrcBase, + DstPackSub, I, TRI); + addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1, DstReg, LaneSrcBase, + DstPackSub, I, TRI); + + if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { + const MachineOperand *SrcMO2 = + TII->getNamedOperand(I, AMDGPU::OpName::src2); + unsigned Src2Mods = + TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm(); + addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2, DstReg, LaneSrcBase, + DstPackSub, I, TRI); + } + NewMI.addImm(ClampVal); + NewMI.addImm(0); + return NewMI; +} + +static void +recomputeIntervalsAfterVirtualUnpack(ArrayRef<MachineInstr *> MIs, + LiveIntervals &LIS, + ArrayRef<Register> ExtraRegs = {}) { + SmallVector<Register, 16> Regs; + for (MachineInstr *MI : MIs) { + if (!MI) + continue; + for (MachineOperand &MO : MI->operands()) { + if (MO.isReg() && MO.getReg().isVirtual()) + Regs.push_back(MO.getReg()); + } + } + Regs.append(ExtraRegs.begin(), ExtraRegs.end()); + llvm::sort(Regs); + Regs.erase(llvm::unique(Regs), Regs.end()); + for (Register R : Regs) { + if (LIS.hasInterval(R)) + LIS.removeInterval(R); + LIS.createAndComputeVirtRegInterval(R); + } +} + +/// Erase COPY defs of \p R in 
\p MBB when \p R has no non-debug uses (e.g. temp +/// only fed this V_PK). Collect virtual regs touched for LIS recomputation. +static void eraseRedundantCopyDefsForRegIfUnused( + Register R, MachineBasicBlock &MBB, MachineRegisterInfo &MRI, + LiveIntervals &LIS, SmallVectorImpl<Register> &RegsToRecompute) { + if (!R.isVirtual() || !MRI.use_nodbg_empty(R)) + return; + + SmallVector<MachineInstr *, 8> ToErase; + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != AMDGPU::COPY) + continue; + MachineOperand &Def = MI.getOperand(0); + if (!Def.isReg() || !Def.isDef() || Def.getReg() != R) + continue; + ToErase.push_back(&MI); + } + for (MachineInstr *MI : ToErase) { + for (MachineOperand &MO : MI->operands()) { + if (MO.isReg() && MO.getReg().isVirtual()) + RegsToRecompute.push_back(MO.getReg()); + } + LIS.RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + } +} + +/// True iff \p A is ordered before \p B in the same block (linear scan). +static bool instrIsBeforeInSameBB(const MachineInstr *A, + const MachineInstr *B) { + assert(A->getParent() == B->getParent()); + for (const MachineInstr &MI : *A->getParent()) { + if (&MI == A) + return true; + if (&MI == B) + return false; + } + llvm_unreachable("instructions not in same block"); +} + +static LaneBitmask laneMaskForRegOperand(const MachineOperand &MO, Register R, + const MachineRegisterInfo &MRI, + const SIRegisterInfo &TRI) { + if (!MO.isReg() || MO.getReg() != R) + return LaneBitmask::getNone(); + const TargetRegisterClass *RC = MRI.getRegClass(R); + unsigned Sub = MO.getSubReg(); + if (Sub) + return TRI.getSubRegIndexLaneMask(Sub); + // No subreg: operand names the full virtual register. RC->getLaneMask() can + // equal a single 32-bit lane for 64-bit vreg_64 classes; OR in the paired + // lane so full-reg reads overlap sub1 when querying sub1's mask. + LaneBitmask M = RC->getLaneMask(); + if (TRI.getRegSizeInBits(*RC) == 64) { + LaneBitmask S0 = TRI.getSubRegIndexLaneMask(AMDGPU::sub0); + LaneBitmask S1 = TRI.getSubRegIndexLaneMask(AMDGPU::sub1); + if (M == S0 || M == S1) + return S0 | S1; + } + return M; +} + +/// True if \p Blocker defines \p R on a lane that overlaps the COPY source or +/// (non-COPY) destination lane, so the same-vreg COPY cannot be sunk past it. +static bool instrBlocksSinkOfSameVRegCopy(const MachineInstr &Blocker, + const MachineInstr &CopyMI, + Register R, LaneBitmask SMask, + LaneBitmask DMask, + const MachineRegisterInfo &MRI, + const SIRegisterInfo &TRI) { + for (const MachineOperand &MO : Blocker.all_defs()) { + if (!MO.isReg() || MO.getReg() != R) + continue; + unsigned Sub = MO.getSubReg(); + const TargetRegisterClass *RC = MRI.getRegClass(R); + LaneBitmask FullMask = RC->getLaneMask(); + LaneBitmask DefM = Sub ? TRI.getSubRegIndexLaneMask(Sub) : FullMask; + if ((DefM & SMask).any()) + return true; + if ((DefM & DMask).any() && &Blocker != &CopyMI) + return true; + } + return false; +} + +/// First instruction in the same MBB strictly after \p CopyMI in program order, +/// before \p SearchEnd (exclusive), that has an operand on \p R touching a +/// lane overlapping \p DMask. 
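+/// Sink helper: e.g. for %v.sub1 = COPY %v.sub0, locate the first later read
+/// touching sub1 so the COPY can be placed immediately before it.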
+static MachineInstr *findFirstDependentUseAfterSameVRegCopy( + MachineInstr &CopyMI, Register R, LaneBitmask DMask, + const MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, + MachineBasicBlock::iterator SearchEnd) { + for (auto It = std::next(CopyMI.getIterator()); It != SearchEnd; ++It) { + MachineInstr &MI = *It; + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.getReg() != R || MO.isDebug()) + continue; + // Ignore operands that do not read the register (e.g. S_NOP may carry + // reg operands that are not real uses for liveness). + if (!MO.readsReg()) + continue; + LaneBitmask M = laneMaskForRegOperand(MO, R, MRI, TRI); + if ((M & DMask).none()) + continue; + return &MI; + } + } + return nullptr; +} + +/// Sink COPY %R:DstSub = %R:SrcSub toward its first dependent use, without +/// crossing \p RegionEnd (the exclusive end of the current schedule region — +/// e.g. the next scheduling boundary). If the first dependent use is in a later +/// region, sink to immediately before \p RegionEnd instead. +static bool trySinkSameVRegSubregCopy(MachineInstr &CopyMI, + MachineRegisterInfo &MRI, + LiveIntervals &LIS, + const SIRegisterInfo &TRI, + MachineBasicBlock::iterator RegionEnd) { + if (CopyMI.getOpcode() != AMDGPU::COPY || CopyMI.getNumOperands() < 2) + return false; + if (CopyMI.isBundled()) + return false; + + MachineOperand &DefMO = CopyMI.getOperand(0); + MachineOperand &SrcMO = CopyMI.getOperand(1); + if (!DefMO.isReg() || !DefMO.isDef() || !SrcMO.isReg()) + return false; + if (!SrcMO.readsReg()) + return false; + Register R = DefMO.getReg(); + if (R != SrcMO.getReg() || !R.isVirtual()) + return false; + unsigned DSub = DefMO.getSubReg(); + unsigned SSub = SrcMO.getSubReg(); + if (!DSub || !SSub || DSub == SSub) + return false; + + LaneBitmask DMask = TRI.getSubRegIndexLaneMask(DSub); + LaneBitmask SMask = TRI.getSubRegIndexLaneMask(SSub); + + MachineBasicBlock *MBB = CopyMI.getParent(); + MachineBasicBlock::iterator CopyIt = CopyMI.getIterator(); + + if (std::next(CopyIt) == RegionEnd) + return false; + + MachineInstr *FirstUseInRegion = findFirstDependentUseAfterSameVRegCopy( + CopyMI, R, DMask, MRI, TRI, RegionEnd); + + MachineBasicBlock::iterator InsertPt; + + if (FirstUseInRegion) { + MachineBasicBlock::iterator FirstUseIt = FirstUseInRegion->getIterator(); + InsertPt = FirstUseIt; + for (MachineBasicBlock::iterator It = std::next(CopyIt); It != FirstUseIt; + ++It) { + if (instrBlocksSinkOfSameVRegCopy(*It, CopyMI, R, SMask, DMask, MRI, + TRI)) { + InsertPt = It; + break; + } + } + } else { + // Dependent use is only past this schedule region — sink to the region + // end (before RegionEnd), not to the first use in the next region. + if (RegionEnd == MBB->end()) + return false; + MachineInstr *LaterUse = findFirstDependentUseAfterSameVRegCopy( + CopyMI, R, DMask, MRI, TRI, MBB->end()); + if (!LaterUse) + return false; + InsertPt = RegionEnd; + for (MachineBasicBlock::iterator It = std::next(CopyIt); It != RegionEnd; + ++It) { + if (instrBlocksSinkOfSameVRegCopy(*It, CopyMI, R, SMask, DMask, MRI, + TRI)) { + InsertPt = It; + break; + } + } + } + + if (InsertPt == std::next(CopyIt)) + return false; + + CopyMI.moveBefore(&*InsertPt); + LIS.handleMove(CopyMI); + return true; +} + +/// COPY DstReg:DstSub = SrcReg:SrcSub with DstReg==SrcReg (virtual): redirect +/// explicit uses of DstSub to SrcSub and remove the COPY. +/// +/// Does not remove the COPY when any use reads the destination lane without an +/// explicit DstSub (e.g. 
full-register %v:vreg_64) — those operands cannot be +/// rewritten to SrcSub without changing semantics (sub1 would become undef). +static bool tryFoldSameVRegSubregCopy(MachineInstr &CopyMI, + MachineRegisterInfo &MRI, + LiveIntervals &LIS, + const SIRegisterInfo &TRI) { + if (CopyMI.getOpcode() != AMDGPU::COPY || CopyMI.getNumOperands() < 2) + return false; + MachineOperand &DefMO = CopyMI.getOperand(0); + MachineOperand &SrcMO = CopyMI.getOperand(1); + if (!DefMO.isReg() || !DefMO.isDef() || !SrcMO.isReg()) + return false; + if (!SrcMO.readsReg()) + return false; + Register R = DefMO.getReg(); + if (R != SrcMO.getReg() || !R.isVirtual()) + return false; + unsigned DSub = DefMO.getSubReg(); + unsigned SSub = SrcMO.getSubReg(); + if (!DSub || !SSub || DSub == SSub) + return false; + + LaneBitmask DMask = TRI.getSubRegIndexLaneMask(DSub); + + SmallVector<MachineOperand *, 16> UseOps; + for (MachineOperand &MO : MRI.use_nodbg_operands(R)) { + if (!MO.readsReg()) + continue; + if (MO.getParent() == &CopyMI) + continue; + + LaneBitmask M = laneMaskForRegOperand(MO, R, MRI, TRI); + if ((M & DMask).none()) + continue; + + unsigned Sub = MO.getSubReg(); + + if (Sub != DSub) + return false; + + MachineInstr *UseMI = MO.getParent(); + if (UseMI->getParent() != CopyMI.getParent()) + return false; + if (!instrIsBeforeInSameBB(&CopyMI, UseMI)) + return false; + + UseOps.push_back(&MO); + } + + for (MachineOperand *MO : UseOps) + MO->setSubReg(SSub); + + LIS.RemoveMachineInstrFromMaps(CopyMI); + CopyMI.eraseFromParent(); + + if (LIS.hasInterval(R)) { + LIS.removeInterval(R); + LIS.createAndComputeVirtRegInterval(R); + } + return true; +} + +/// Walk [Begin, End) and fold / sink same-vreg subreg COPYs (iterator-safe). +static bool cleanupSameVRegSubregCopiesInRange( + MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, + MachineRegisterInfo &MRI, LiveIntervals &LIS, const SIRegisterInfo &TRI) { + bool Changed = false; + SmallVector<MachineInstr *, 16> Copies; + for (auto I = Begin; I != End; ++I) { + if (I->getOpcode() == AMDGPU::COPY) + Copies.push_back(&*I); + } + for (MachineInstr *MI : Copies) { + if (tryFoldSameVRegSubregCopy(*MI, MRI, LIS, TRI)) + Changed = true; + } + Copies.clear(); + for (auto I = Begin; I != End; ++I) { + if (I->getOpcode() == AMDGPU::COPY) + Copies.push_back(&*I); + } + for (MachineInstr *MI : Copies) { + if (trySinkSameVRegSubregCopy(*MI, MRI, LIS, TRI, End)) + Changed = true; + } + return Changed; +} + +static bool performF32Unpacking(MachineInstr &I, const SIInstrInfo *TII, + const SIRegisterInfo *TRI, LiveIntervals &LIS, + DenseSet<Register> *InvolvedUnpackRegs) { + uint32_t UnpackedOpcode = mapToUnpackedOpcode(I); + if (UnpackedOpcode == std::numeric_limits<uint32_t>::max()) { + debugSkipVPKUnpack(I, TII, TRI, "unsupported_opcode"); + return false; + } + if (canUnpackingClobberRegister(I, TII, TRI)) { + debugSkipVPKUnpack(I, TII, TRI, "would_clobber_overlapping_src"); + return false; + } + + const MachineOperand &DstMO = I.getOperand(0); + Register DstReg = DstMO.getReg(); + if (!DstReg.isVirtual()) { + debugSkipVPKUnpack(I, TII, TRI, "physical_dst"); + return false; + } + + unsigned DstSub = DstMO.getSubReg(); + + MachineFunction *MF = I.getMF(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + + // Lane indices for subreg defs: full vreg_64 uses sub0/sub1. 
For composite + // 64-bit packed destinations (sub0_sub1, sub2_sub3, sub4_sub5, sub6_sub7, + // … on wide vectors), decompose with composeSubRegIndices — not getSubReg, + // whose first operand is an MCRegister, not a SubRegIndex (SIInstrInfo uses + // the same compose pattern for partial subregs). + unsigned LoSeqIdx = DstSub ? TRI->composeSubRegIndices(DstSub, AMDGPU::sub0) + : static_cast<unsigned>(AMDGPU::sub0); + unsigned HiSeqIdx = DstSub ? TRI->composeSubRegIndices(DstSub, AMDGPU::sub1) + : static_cast<unsigned>(AMDGPU::sub1); + if (DstSub && (!LoSeqIdx || !HiSeqIdx)) { + debugSkipVPKUnpack(I, TII, TRI, "bad_composite_subreg_decompose", LoSeqIdx, + HiSeqIdx); + return false; + } + + const bool LoRCValid = TRI->isSubRegValidForRegClass(DstRC, LoSeqIdx); + const bool HiRCValid = TRI->isSubRegValidForRegClass(DstRC, HiSeqIdx); + if (!LoRCValid || !HiRCValid) { + debugSkipVPKUnpack(I, TII, TRI, "invalid_subreg_for_regclass", LoSeqIdx, + HiSeqIdx, LoRCValid, HiRCValid); + return false; + } + + Register LaneSrcBase = DstReg; + + const bool UndefOnFirstLane = !vpkAnySrcUsesDst(I, TII, DstReg) && + !hasExplicitDefOfRegBefore(I, DstReg); + + SmallVector<Register, 4> CleanCandidates; + auto AddSrcCandidate = [&](const MachineOperand *MO) { + if (!MO || !MO->isReg()) + return; + Register R = MO->getReg(); + if (!R.isVirtual() || R == DstReg) + return; + if (!isVirtualSrcRegOnlyUsedByThisVPK(R, I, MRI)) + return; + CleanCandidates.push_back(R); + }; + AddSrcCandidate(TII->getNamedOperand(I, AMDGPU::OpName::src0)); + AddSrcCandidate(TII->getNamedOperand(I, AMDGPU::OpName::src1)); + if (AMDGPU::hasNamedOperand(I.getOpcode(), AMDGPU::OpName::src2)) + AddSrcCandidate(TII->getNamedOperand(I, AMDGPU::OpName::src2)); + llvm::sort(CleanCandidates); + CleanCandidates.erase(llvm::unique(CleanCandidates), CleanCandidates.end()); + + SmallVector<Register, 4> SrcRegsForInvolvedSet; + auto AddInvolvedSrc = [&](const MachineOperand *MO) { + if (!MO || !MO->isReg()) + return; + Register R = MO->getReg(); + if (R.isVirtual()) + SrcRegsForInvolvedSet.push_back(R); + }; + AddInvolvedSrc(TII->getNamedOperand(I, AMDGPU::OpName::src0)); + AddInvolvedSrc(TII->getNamedOperand(I, AMDGPU::OpName::src1)); + if (AMDGPU::hasNamedOperand(I.getOpcode(), AMDGPU::OpName::src2)) + AddInvolvedSrc(TII->getNamedOperand(I, AMDGPU::OpName::src2)); + + MachineBasicBlock *MBB = I.getParent(); + + LIS.RemoveMachineInstrFromMaps(I); + + // BuildMI(MBB, I, ...) prepends each instruction immediately before I, so the + // first BuildMI call ends up furthest from I in program order: low lane, then + // high lane. 
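+  // Resulting shape for a plain vreg_64 destination (sketch; V_PK_ADD_F32
+  // case, sources/modifiers elided, undef only when UndefOnFirstLane):
+  //   undef %dst.sub0:vreg_64 = V_ADD_F32_e64 ...   ; Op0L (low lane)
+  //         %dst.sub1:vreg_64 = V_ADD_F32_e64 ...   ; Op0H (high lane)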
+ MachineInstrBuilder Op0L = createUnpackedMI(I, TII, TRI, UnpackedOpcode, + /*IsHiBits=*/false, DstReg, + LoSeqIdx, LaneSrcBase, DstSub, + /*UndefOnDef=*/UndefOnFirstLane); + LIS.InsertMachineInstrInMaps(*Op0L); + MachineInstrBuilder Op0H = + createUnpackedMI(I, TII, TRI, UnpackedOpcode, + /*IsHiBits=*/true, DstReg, HiSeqIdx, LaneSrcBase, DstSub, + /*UndefOnDef=*/false); + LIS.InsertMachineInstrInMaps(*Op0H); + + uint32_t IFlags = I.getFlags(); + Op0L->setFlags(IFlags); + Op0H->setFlags(IFlags); + + I.eraseFromParent(); + + MF->getProperties().reset(MachineFunctionProperties::Property::IsSSA); + + SmallVector<Register, 16> ExtraRegs; + for (Register R : CleanCandidates) + eraseRedundantCopyDefsForRegIfUnused(R, *MBB, MRI, LIS, ExtraRegs); + + SmallVector<MachineInstr *, 5> ToRecompute; + ToRecompute.push_back(&*Op0L); + ToRecompute.push_back(&*Op0H); + recomputeIntervalsAfterVirtualUnpack(ToRecompute, LIS, ExtraRegs); + if (InvolvedUnpackRegs) { + InvolvedUnpackRegs->insert(DstReg); + for (Register R : SrcRegsForInvolvedSet) + InvolvedUnpackRegs->insert(R); + } + return true; +} + +/// True if \p R is a 64-bit VGPR/AGPR virtual register and every register +/// operand uses only \p sub0 or \p sub1 (no full-register or other subregs). +static bool is64BitOnlySub0Sub1(Register R, const MachineRegisterInfo &MRI, + const SIRegisterInfo &TRI) { + if (!R.isVirtual()) + return false; + const TargetRegisterClass *RC = MRI.getRegClass(R); + if (!RC || TRI.getRegSizeInBits(*RC) != 64) + return false; + if (!TRI.isVGPR(MRI, R) && !TRI.isAGPR(MRI, R)) + return false; + + bool Any = false; + for (const MachineOperand &MO : MRI.reg_operands(R)) { + Any = true; + unsigned S = MO.getSubReg(); + if (S == AMDGPU::NoSubRegister) + return false; + if (S != AMDGPU::sub0 && S != AMDGPU::sub1) + return false; + } + return Any; +} + +static bool split64BitOnlySub0Sub1ToV32(Register R, MachineRegisterInfo &MRI, + const SIRegisterInfo &TRI, + LiveIntervals &LIS) { + const TargetRegisterClass *RC = MRI.getRegClass(R); + const TargetRegisterClass *LoRC = TRI.getSubRegisterClass(RC, AMDGPU::sub0); + const TargetRegisterClass *HiRC = TRI.getSubRegisterClass(RC, AMDGPU::sub1); + if (!LoRC || !HiRC) + return false; + + bool HasLo = false, HasHi = false; + for (const MachineOperand &MO : MRI.reg_operands(R)) { + unsigned S = MO.getSubReg(); + if (S == AMDGPU::sub0) + HasLo = true; + else if (S == AMDGPU::sub1) + HasHi = true; + } + if (!HasLo && !HasHi) + return false; + + Register NewLo, NewHi; + if (HasLo) + NewLo = MRI.createVirtualRegister(LoRC); + if (HasHi) + NewHi = MRI.createVirtualRegister(HiRC); + + SmallVector<MachineOperand *, 32> Ops; + for (MachineOperand &MO : MRI.reg_operands(R)) + Ops.push_back(&MO); + + for (MachineOperand *MO : Ops) { + unsigned S = MO->getSubReg(); + Register NR = (S == AMDGPU::sub0) ? NewLo : NewHi; + MO->setReg(NR); + MO->setSubReg(AMDGPU::NoSubRegister); + // Partial subreg defs used UndefOnDef for the wide vreg; each new vreg_32 + // is fully written by its instruction — drop stale undef on defs. 
+ if (MO->isDef()) + MO->setIsUndef(false); + } + + if (LIS.hasInterval(R)) + LIS.removeInterval(R); + if (HasLo) + LIS.createAndComputeVirtRegInterval(NewLo); + if (HasHi) + LIS.createAndComputeVirtRegInterval(NewHi); + + LLVM_DEBUG({ + dbgs() << DEBUG_TYPE << ": post-unpack split " << printReg(R, &TRI) + << " -> "; + if (HasLo) + dbgs() << printReg(NewLo, &TRI) << '(' << TRI.getRegClassName(LoRC) + << ')'; + if (HasLo && HasHi) + dbgs() << ", "; + if (HasHi) + dbgs() << printReg(NewHi, &TRI) << '(' << TRI.getRegClassName(HiRC) + << ')'; + dbgs() << '\n'; + }); + + return true; +} + +/// Split 64-bit VGPR/AGPR vregs that only use \p sub0 / \p sub1 into 32-bit +/// vregs. Only considers virtual registers that appeared as a V_PK +/// destination or as a virtual src0/src1/src2 on a successful unpack in this +/// pass (\p InvolvedUnpackRegs). Only the first \p MaxRegsToProcess candidates +/// in virtual-register index order are considered (UINT_MAX = all). +static bool postUnpackSplit64BitSubregsToV32( + MachineFunction &MF, LiveIntervals &LIS, const SIRegisterInfo &TRI, + unsigned MaxRegsToProcess, const DenseSet<Register> &InvolvedUnpackRegs) { + assert(MaxRegsToProcess > 0); + if (InvolvedUnpackRegs.empty()) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + const unsigned NumVR = MRI.getNumVirtRegs(); + SmallVector<Register, 64> Candidates; + Candidates.reserve(InvolvedUnpackRegs.size()); + for (unsigned I = 0; I < NumVR; ++I) { + Register R = Register::index2VirtReg(I); + if (!InvolvedUnpackRegs.contains(R)) + continue; + if (is64BitOnlySub0Sub1(R, MRI, TRI)) + Candidates.push_back(R); + } + llvm::sort(Candidates, [](Register A, Register B) { + return A.virtRegIndex() < B.virtRegIndex(); + }); + + const bool Unlimited = + MaxRegsToProcess == std::numeric_limits<unsigned>::max(); + + bool Changed = false; + unsigned Seen = 0; + for (Register R : Candidates) { + if (!Unlimited && Seen >= MaxRegsToProcess) + break; + ++Seen; + if (split64BitOnlySub0Sub1ToV32(R, MRI, TRI, LIS)) + Changed = true; + } + return Changed; +} + +class AMDGPUIGLPUnpackImpl { + LiveIntervals *LIS; + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; + +public: + explicit AMDGPUIGLPUnpackImpl(LiveIntervals *L) : LIS(L) {} + + bool run(MachineFunction &MF); +}; + +bool AMDGPUIGLPUnpackImpl::run(MachineFunction &MF) { + if (!LIS) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + const MCInstrInfo &II = *TII; + + SmallVector< + std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 8> + SubRegions; + + bool Changed = false; + bool AnyVPKUnpacked = false; + DenseSet<Register> InvolvedUnpackRegs; + + for (MachineBasicBlock &MBB : MF) { + splitMBBBySchedBarriers(MBB, *TII, SubRegions); + + for (auto [Beg, End] : SubRegions) { + CandidateRegion CR; + if (!findCandidateRegion(MBB, Beg, End, II, CR)) { + LLVM_DEBUG({ + unsigned NumVPK = 0; + bool HasValuSpacingIGLP = false; + for (auto It = Beg; It != End; ++It) { + if (isVPKOpcode(II, It->getOpcode())) + ++NumVPK; + if (It->getOpcode() == AMDGPU::IGLP_OPT && + It->getNumOperands() >= 1 && It->getOperand(0).isImm() && + It->getOperand(0).getImm() == + static_cast<int64_t>( + AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID)) + HasValuSpacingIGLP = true; + } + if (NumVPK > 0 && kRequireMFMAValuSpacingIGLPOptForRegion && + !HasValuSpacingIGLP) + dbgs() << DEBUG_TYPE << ": skip region MBB#" << MBB.getNumber() + << " (" << NumVPK << " V_PK, 
missing IGLP_OPT imm=" + << static_cast<int>( + AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID) + << ")\n"; + }); + continue; + } + + if (schedRegionHasExplicitAllocatablePhysReg(Beg, End, MF.getRegInfo())) { + LLVM_DEBUG({ + dbgs() << DEBUG_TYPE << ": skip region MBB#" << MBB.getNumber() + << " (explicit allocatable physical register operand)\n"; + }); + continue; + } + + const unsigned MaxVPK = AMDGPUIGLPUnpackMaxVPKPerRegion; + LLVM_DEBUG({ + dbgs() << DEBUG_TYPE << ": " << MF.getName() << " MBB#" + << MBB.getNumber() + << " region instrs=" << std::distance(Beg, End) + << " v_pk=" << CR.VPKInsts.size(); + if (MaxVPK > 0) + dbgs() << " max_vpk_unpack=" << MaxVPK; + dbgs() << "\n"; + }); + + unsigned UnpackSlot = 0; + for (MachineInstr *MI : CR.VPKInsts) { + if (MaxVPK > 0 && UnpackSlot >= MaxVPK) { + LLVM_DEBUG({ + dbgs() << DEBUG_TYPE << ": skip remaining V_PK in region (limit " + << MaxVPK << ")\n"; + }); + break; + } + ++UnpackSlot; + if (performF32Unpacking(*MI, TII, TRI, *LIS, &InvolvedUnpackRegs)) { + Changed = true; + AnyVPKUnpacked = true; + } + } + + if (cleanupSameVRegSubregCopiesInRange(CR.Begin, CR.End, MF.getRegInfo(), + *LIS, *TRI)) + Changed = true; + } + } + + if (AnyVPKUnpacked) { + const unsigned PostMax = AMDGPUIGLPUnpackPostCleanupMaxVRegs; + if (PostMax > 0) { + if (postUnpackSplit64BitSubregsToV32(MF, *LIS, *TRI, PostMax, + InvolvedUnpackRegs)) + Changed = true; + } + } + + return Changed; +} + +class AMDGPUIGLPUnpackLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUIGLPUnpackLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); + return AMDGPUIGLPUnpackImpl(&LIS).run(MF); + } + + StringRef getPassName() const override { return "AMDGPU IGLP unpack"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // Like GCNRewritePartialRegUses: preserving LIS/SlotIndexes avoids the + // legacy PM recomputing them before MachineScheduler when we make no MIR + // changes. + AU.setPreservesCFG(); + AU.addRequired<LiveIntervalsWrapperPass>(); + AU.addPreserved<LiveIntervalsWrapperPass>(); + AU.addPreserved<SlotIndexesWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // namespace + +char AMDGPUIGLPUnpackLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(AMDGPUIGLPUnpackLegacy, DEBUG_TYPE, "AMDGPU IGLP unpack", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_END(AMDGPUIGLPUnpackLegacy, DEBUG_TYPE, "AMDGPU IGLP unpack", + false, false) + +char &llvm::AMDGPUIGLPUnpackID = AMDGPUIGLPUnpackLegacy::ID; + +PreservedAnalyses +AMDGPUIGLPUnpackPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF); + AMDGPUIGLPUnpackImpl Impl(&LIS); + if (!Impl.run(MF)) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index d49ec90e4c212..cf8caebf08813 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -819,13 +819,6 @@ void PipelineSolver::solve() { LLVM_DEBUG(DAG->dump()); } -enum IGLPStrategyID : int { - MFMASmallGemmOptID = 0, - MFMASmallGemmSingleWaveOptID = 1, - MFMAExpInterleaveID = 2, - MFMAExpSimpleInterleaveID = 3 -}; - // Implement a IGLP scheduling strategy. 
class IGLPStrategy { protected: @@ -896,6 +889,76 @@ bool MFMASmallGemmOpt::applyIGLPStrategy( return true; } +static bool isMFMAValuSpacingGapValu(const MachineInstr &MI, + const SIInstrInfo *TII) { + if (MI.isMetaInstruction()) + return false; + return TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI) && + !MI.mayLoadOrStore(); +} + +/// Interleave MFMA/WMMA with VALU slots: each repeating stage is one MFMA (or +/// WMMA), then up to N VALU ops per gap where N = floor(#VALU / #MFMA) in this +/// schedule region (same predicate as \c isMFMAValuSpacingGapValu), at least 1. +/// Template length uses MFMACount * 3 for slack, like MFMASmallGemmOpt. +/// \p IsBottomUp is false so SchedGroup pipeline order matches forward program +/// order (MFMA before its VALU gap). +class MFMAValuSpacingOpt final : public IGLPStrategy { +public: + bool applyIGLPStrategy( + DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + AMDGPU::SchedulingPhase Phase) override; + + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, + AMDGPU::SchedulingPhase Phase) override { + for (const MachineInstr &I : *DAG) + if (TII->isMFMAorWMMA(I)) + return true; + return false; + } + + MFMAValuSpacingOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : IGLPStrategy(DAG, TII) { + IsBottomUp = false; + } +}; + +bool MFMAValuSpacingOpt::applyIGLPStrategy( + DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, + AMDGPU::SchedulingPhase Phase) { + unsigned MFMACount = 0; + unsigned ValuCount = 0; + for (const MachineInstr &I : *DAG) { + if (TII->isMFMAorWMMA(I)) + ++MFMACount; + else if (isMFMAValuSpacingGapValu(I, TII)) + ++ValuCount; + } + + unsigned ValuGap = 1; + if (MFMACount > 0) { + ValuGap = ValuCount / MFMACount; + if (ValuGap < 1) + ValuGap = 1; + } + + const unsigned PipelineSyncID = 0; + SchedGroup *SG = nullptr; + for (unsigned I = 0; I < MFMACount * 3; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, ValuGap, PipelineSyncID, DAG, TII); + SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]); + } + + return true; +} + class MFMAExpInterleaveOpt final : public IGLPStrategy { private: // The count of TRANS SUs involved in the interleaved pipeline @@ -2316,17 +2379,19 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( } static std::unique_ptr<IGLPStrategy> -createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, +createIGLPStrategy(AMDGPU::IGLPStrategyID ID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) { switch (ID) { - case MFMASmallGemmOptID: + case AMDGPU::IGLPStrategyID::MFMASmallGemmOptID: return std::make_unique<MFMASmallGemmOpt>(DAG, TII); - case MFMASmallGemmSingleWaveOptID: + case AMDGPU::IGLPStrategyID::MFMASmallGemmSingleWaveOptID: return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); - case MFMAExpInterleaveID: + case AMDGPU::IGLPStrategyID::MFMAExpInterleaveID: return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII); - case MFMAExpSimpleInterleaveID: + case AMDGPU::IGLPStrategyID::MFMAExpSimpleInterleaveID: return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII); + case AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID: + return std::make_unique<MFMAValuSpacingOpt>(DAG, TII); } llvm_unreachable("Unknown IGLPStrategyID"); @@ -2702,8 +2767,8 
@@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( } bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { - IGLPStrategyID StrategyID = - (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); + auto StrategyID = static_cast<AMDGPU::IGLPStrategyID>( + SU.getInstr()->getOperand(0).getImm()); auto S = createIGLPStrategy(StrategyID, DAG, TII); if (!S->shouldApplyStrategy(DAG, Phase)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index 0688f07873493..3b8ecd1fc14e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" #include <memory> @@ -17,11 +18,31 @@ namespace llvm { namespace AMDGPU { // The current phase of instruction scheduling enum class SchedulingPhase { Initial, PreRAReentry, PostRA }; + +/// \c llvm.amdgcn.iglp.opt / \c IGLP_OPT immediate; must match +/// \c createIGLPStrategy in AMDGPUIGroupLP.cpp. +enum class IGLPStrategyID : int { + MFMASmallGemmOptID = 0, + MFMASmallGemmSingleWaveOptID = 1, + MFMAExpInterleaveID = 2, + MFMAExpSimpleInterleaveID = 3, + MFMAValuSpacingOptID = 4, +}; + } // namespace AMDGPU std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase); +/// V_PK unpack and related MIR cleanup before the pre-RA scheduler. By default, +/// a schedule region is transformed only when it contains \c IGLP_OPT with +/// immediate \c MFMAValuSpacingOptID (4) as well as V_PK ops. +class AMDGPUIGLPUnpackPass : public PassInfoMixin<AMDGPUIGLPUnpackPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + } // namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 8a046e83548cc..ff464ca657970 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -112,6 +112,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(* #ifndef MACHINE_FUNCTION_PASS #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) #endif +MACHINE_FUNCTION_PASS("amdgpu-iglp-unpack", AMDGPUIGLPUnpackPass()) MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass()) MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) MACHINE_FUNCTION_PASS("amdgpu-lower-vgpr-encoding", AMDGPULowerVGPREncodingPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index daa9f933fce59..5ca481deef518 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -556,6 +556,11 @@ static cl::opt<bool> EnablePreRAOptimizations( cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnableAMDGPUIGLPUnpack( + "amdgpu-enable-iglp-unpack", + cl::desc("Run AMDGPU IGLP unpack pass before pre-RA scheduling"), + cl::init(true), cl::Hidden); + static cl::opt<bool> EnablePromoteKernelArguments( "amdgpu-enable-promote-kernel-arguments", cl::desc("Enable promotion of flat kernel pointer arguments to global"), @@ -721,6 +726,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGCNNSAReassignLegacyPass(*PR); 
initializeGCNPreRAOptimizationsLegacyPass(*PR); initializeGCNPreRALongBranchRegLegacyPass(*PR); + initializeAMDGPUIGLPUnpackLegacyPass(*PR); initializeGCNRewritePartialRegUsesLegacyPass(*PR); initializeGCNRegPressurePrinterPass(*PR); initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); @@ -1724,6 +1730,9 @@ void GCNPassConfig::addOptimizedRegAlloc() { if (EnableRewritePartialRegUses) insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID); + if (EnableAMDGPUIGLPUnpack) + insertPass(&RenameIndependentSubregsID, &AMDGPUIGLPUnpackID); + if (isPassEnabled(EnablePreRAOptimizations)) insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID); @@ -2498,6 +2507,9 @@ Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( if (EnableRewritePartialRegUses) insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass()); + if (EnableAMDGPUIGLPUnpack) + insertPass<RenameIndependentSubregsPass>(AMDGPUIGLPUnpackPass()); + if (isPassEnabled(EnablePreRAOptimizations)) insertPass<MachineSchedulerPass>(GCNPreRAOptimizationsPass()); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 73d2957516539..95ce25efdf7b9 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -88,6 +88,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMemoryUtils.cpp AMDGPUCoExecSchedStrategy.cpp AMDGPUIGroupLP.cpp + AMDGPUIGroupFixup.cpp AMDGPULowerVGPREncoding.cpp AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ad24bad1fd5d7..caef31da175fb 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -3244,6 +3244,21 @@ static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { }); } +static bool hasIGLPOpt(ScheduleDAGInstrs *DAG, AMDGPU::IGLPStrategyID ID) { + return any_of(*DAG, [ID](MachineBasicBlock::iterator MI) { + return MI->getOpcode() == AMDGPU::IGLP_OPT && MI->getNumOperands() >= 1 && + MI->getOperand(0).isImm() && + MI->getOperand(0).getImm() == static_cast<int64_t>(ID); + }); +} + +static bool hasSchedBarrier(ScheduleDAGInstrs *DAG) { + return any_of(*DAG, [](MachineBasicBlock::iterator MI) { + unsigned Opc = MI->getOpcode(); + return Opc == AMDGPU::SCHED_BARRIER || Opc == AMDGPU::SCHED_GROUP_BARRIER; + }); +} + GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, bool RemoveKillFlags) @@ -3252,6 +3267,17 @@ GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( void GCNPostScheduleDAGMILive::schedule() { HasIGLPInstrs = hasIGLPInstrs(this); if (HasIGLPInstrs) { + // MFMAValuSpacingOpt is a pre-RA strategy whose interleaving is correct + // after the initial machine scheduler. The post-RA scheduler would undo + // the reordering, so preserve the pre-RA schedule by skipping here. + // When SCHED_[GROUP_]BARRIER coexists with IGLP_OPT, IGroupLP ignores the + // IGLP_OPT (they are mutually exclusive), so let post-RA scheduling proceed + // normally. 
+ if (hasIGLPOpt(this, AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID) && + !hasSchedBarrier(this)) { + HasIGLPInstrs = false; + return; + } SavedMutations.clear(); SavedMutations.swap(Mutations); addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); diff --git a/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll new file mode 100644 index 0000000000000..fad3dde3d06bf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Full codegen on gfx950. Two MFMAs fed by loaded floats; three independent +; i32 muls stored to a second buffer. sched.barrier(0) isolates the MUL+MFMA +; region so that address-computation VALUs don't inflate the VALU gap in +; MFMAValuSpacingOpt. +; +; With iglp_opt(4) the expected MFMA/VALU interleaving (ValuGap=1) is: +; MFMA, MUL, MFMA, MUL, MUL +; +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - %s | FileCheck %s + +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() +declare void @llvm.amdgcn.iglp.opt(i32 immarg) +declare void @llvm.amdgcn.sched.barrier(i32 immarg) + +define amdgpu_kernel void @mfma_valu_iglp4(ptr addrspace(1) %p, ptr addrspace(1) %q) #0 { +; CHECK-LABEL: mfma_valu_iglp4: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v8, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v9, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[6:7], v8, s[0:1] +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; CHECK-NEXT: ; sched_barrier mask(0x00000000) +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3] +; CHECK-NEXT: v_mul_lo_u32 v4, v6, v6 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3] +; CHECK-NEXT: v_mul_lo_u32 v5, v6, v7 +; CHECK-NEXT: v_mul_lo_u32 v6, v7, v7 +; CHECK-NEXT: ; iglp_opt mask(0x00000004) +; CHECK-NEXT: ; sched_barrier mask(0x00000000) +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; CHECK-NEXT: global_store_dwordx3 v8, v[4:6], s[2:3] +; CHECK-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %t = load <4 x float>, ptr addrspace(1) %p + %gep0 = getelementptr inbounds float, ptr addrspace(1) %p, i32 %tid + %gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i32 1 + %f0 = load float, ptr addrspace(1) %gep0 + %f1 = load float, ptr addrspace(1) %gep1 + %i0 = bitcast float %f0 to i32 + %i1 = bitcast float %f1 to i32 + call void @llvm.amdgcn.sched.barrier(i32 0) + %m0 = mul nsw i32 %i0, %i0 + %m1 = mul nsw i32 %i0, %i1 + %m2 = mul nsw i32 %i1, %i1 + call void @llvm.amdgcn.iglp.opt(i32 4) + %mai = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %t, i32 0, i32 0, i32 0) + %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %mai, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.sched.barrier(i32 0) + store <4 x float> %mai2, ptr addrspace(1) %p + %qgep0 = getelementptr inbounds i32, ptr addrspace(1) %q, i32 %tid + %qgep1 = getelementptr inbounds i32, ptr 
addrspace(1) %qgep0, i32 1 + %qgep2 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 2 + store i32 %m0, ptr addrspace(1) %qgep0 + store i32 %m1, ptr addrspace(1) %qgep1 + store i32 %m2, ptr addrspace(1) %qgep2 + ret void +} + +attributes #0 = { "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir new file mode 100644 index 0000000000000..7f173ddc8e5aa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# Pre-RA machine scheduler with IGroupLP / MFMAValuSpacingOpt (iglp_opt(4)). +# With IGLP_OPT 4 the expected MFMA/VALU interleaving (ValuGap=1) is: +# MFMA, MUL, MFMA, MUL, MUL +# +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s + +--- +name: mfma_valu_iglp4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: mfma_valu_iglp4 + ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF]], [[DEF1]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF]], [[DEF]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF]], [[DEF1]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF1]], [[DEF1]], implicit $exec + ; CHECK-NEXT: IGLP_OPT 4 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_]], implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_1]] + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:areg_128_align2 = IMPLICIT_DEF + %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %0, %0, implicit $exec + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %0, %1, implicit $exec + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %1, %1, implicit $exec + %6:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %0, %1, %2, 0, 0, 0, implicit $mode, implicit $exec + %7:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %0, %1, %6, 0, 0, 0, implicit $mode, implicit $exec + IGLP_OPT 4 + S_ENDPGM 0, implicit %3, implicit %4, implicit %5, implicit %7 + +... diff --git a/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir b/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir new file mode 100644 index 0000000000000..76b837dae3ea1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir @@ -0,0 +1,40 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# Reduced from a large FMHA-style region (MFMA tower + VALU + V_PK_FMA_F32 + +# IGLP_OPT 4 between SCHED_BARRIERs). Pair with `iglp-unpack-mfma-vpk.ll` (IR). 
+# +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-iglp-unpack -o - %s | FileCheck %s + +--- +name: iglp_unpack_mfma_vpk +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: iglp_unpack_mfma_vpk + ; CHECK: SCHED_BARRIER 0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %2:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 [[DEF]].sub0_sub1, [[DEF1]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_:%[0-9]+]]:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 [[DEF]].sub2_sub3, [[DEF1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[DEF5]].sub0, 0, [[DEF3]].sub0, 0, [[DEF4]].sub0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[DEF5]].sub1, 0, [[DEF3]].sub0, 0, [[DEF4]].sub1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: IGLP_OPT 4 + ; CHECK-NEXT: SCHED_BARRIER 0 + ; CHECK-NEXT: S_ENDPGM 0 + SCHED_BARRIER 0 + %0:vreg_128_align2 = IMPLICIT_DEF + %1:vreg_128_align2 = IMPLICIT_DEF + early-clobber %2:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %0.sub0_sub1:vreg_128_align2, %1.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %0.sub2_sub3:vreg_128_align2, %1.sub2_sub3:vreg_128_align2, %2:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3:sgpr_256 = IMPLICIT_DEF + %8:sgpr_64 = IMPLICIT_DEF + %10:vreg_64_align2 = IMPLICIT_DEF + %28:vreg_64_align2 = IMPLICIT_DEF + %31:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, %28:vreg_64_align2, 0, %8:sgpr_64, 8, %10:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + IGLP_OPT 4 + SCHED_BARRIER 0 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll b/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll new file mode 100644 index 0000000000000..dc26df7f55783 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Making sure iglp_opt(4) will break V_PK_FMA into two V_FMA. 
+; +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -O2 -o - %s | FileCheck %s + +define amdgpu_kernel void @iglp_unpack_mfma_vpk_ir(ptr addrspace(1) %p, i64 %a, i64 %b) #0 { +; CHECK-LABEL: iglp_unpack_mfma_vpk_ir: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x10 +; CHECK-NEXT: ; kill: killed $sgpr4_sgpr5 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v18, s18 +; CHECK-NEXT: v_mov_b32_e32 v19, s19 +; CHECK-NEXT: ; sched_barrier mask(0x00000000) +; CHECK-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; CHECK-NEXT: v_mov_b64_e32 v[20:21], s[20:21] +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[16:17], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; CHECK-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; CHECK-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; CHECK-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; CHECK-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[18:19], v[20:21], v[0:15] +; CHECK-NEXT: s_nop 10 +; CHECK-NEXT: v_fma_f32 v0, v0, s18, v0 +; CHECK-NEXT: v_fma_f32 v1, v1, s19, v1 +; CHECK-NEXT: ; iglp_opt mask(0x00000004) +; CHECK-NEXT: ; sched_barrier mask(0x00000000) +; CHECK-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; CHECK-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.sched.barrier(i32 0) + %acc = load <16 x float>, ptr addrspace(1) %p + %m = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %acc, i32 0, i32 0, i32 0) + %e0 = extractelement <16 x float> %m, i64 0 + %e1 = extractelement <16 x float> %m, i64 1 + %v = insertelement <2 x float> poison, float %e0, i64 0 + %v2 = insertelement <2 x float> %v, float %e1, i64 1 + %c = load <2 x float>, ptr addrspace(1) %p + %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %c, <2 x float> %v2) + call void @llvm.amdgcn.iglp.opt(i32 4) + call void @llvm.amdgcn.sched.barrier(i32 0) + store <2 x float> %fma, ptr addrspace(1) %p + ret void +} + +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64, i64, <16 x float>, i32, i32, i32) +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) +declare void @llvm.amdgcn.sched.barrier(i32 immarg) +declare void @llvm.amdgcn.iglp.opt(i32 immarg) + +attributes #0 = { "uniform-work-group-size"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index c49b2b927bd31..5f88d9eed5eac 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -201,6 +201,7 @@ ; GCN-O2-NEXT: register-coalescer ; GCN-O2-NEXT: rename-independent-subregs ; GCN-O2-NEXT: amdgpu-rewrite-partial-reg-uses +; GCN-O2-NEXT: amdgpu-iglp-unpack ; GCN-O2-NEXT: machine-scheduler ; GCN-O2-NEXT: amdgpu-pre-ra-optimizations ; GCN-O2-NEXT: si-wqm @@ -370,6 +371,7 @@ ; GCN-O3-NEXT: register-coalescer ; GCN-O3-NEXT: rename-independent-subregs ; GCN-O3-NEXT: amdgpu-rewrite-partial-reg-uses +; GCN-O3-NEXT: amdgpu-iglp-unpack ; GCN-O3-NEXT: machine-scheduler ; GCN-O3-NEXT: amdgpu-pre-ra-optimizations ; GCN-O3-NEXT: si-wqm diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index cf127b0bc0d3b..8ea844e401c14 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -360,6 +360,7 @@ ; GCN-O1-NEXT: Register Coalescer ; GCN-O1-NEXT: Rename Disconnected Subregister Components ; GCN-O1-NEXT: Rewrite Partial Register Uses +; GCN-O1-NEXT: AMDGPU IGLP unpack ; GCN-O1-NEXT: Machine Instruction Scheduler ; GCN-O1-NEXT: SI Whole Quad Mode ; GCN-O1-NEXT: SI optimize exec mask operations pre-RA @@ -676,6 +677,7 @@ ; GCN-O1-OPTS-NEXT: Register Coalescer ; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components ; GCN-O1-OPTS-NEXT: Rewrite Partial Register Uses +; GCN-O1-OPTS-NEXT: AMDGPU IGLP unpack ; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler ; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA optimizations ; GCN-O1-OPTS-NEXT: SI Whole Quad Mode @@ -997,6 +999,7 @@ ; GCN-O2-NEXT: Register Coalescer ; GCN-O2-NEXT: Rename Disconnected Subregister Components ; GCN-O2-NEXT: Rewrite Partial Register Uses +; GCN-O2-NEXT: AMDGPU IGLP unpack ; GCN-O2-NEXT: Machine Instruction Scheduler ; GCN-O2-NEXT: AMDGPU Pre-RA optimizations ; GCN-O2-NEXT: SI Whole Quad Mode @@ -1332,6 +1335,7 @@ ; GCN-O3-NEXT: Register Coalescer ; GCN-O3-NEXT: Rename Disconnected Subregister Components ; GCN-O3-NEXT: Rewrite Partial Register Uses +; GCN-O3-NEXT: AMDGPU IGLP unpack ; GCN-O3-NEXT: Machine Instruction Scheduler ; GCN-O3-NEXT: AMDGPU Pre-RA optimizations ; GCN-O3-NEXT: SI Whole Quad Mode diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 75a97a8b840e5..d82bb3eacf1b8 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -152,6 +152,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUHSAMetadataStreamer.cpp", "AMDGPUHazardLatency.cpp", "AMDGPUIGroupLP.cpp", + "AMDGPUIGroupFixup.cpp", "AMDGPUISelDAGToDAG.cpp", "AMDGPUISelLowering.cpp", "AMDGPUImageIntrinsicOptimizer.cpp",
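
For reference, a source-level usage sketch of the new hint (illustrative only, not part of the patch; the kernel name, buffer layout, and MFMA variant are assumptions), mirroring iglp-opt4-mfma-valu-spacing-scheduler.ll above. The independent VALU work, the two MFMAs, and iglp_opt(4) sit in one scheduling region fenced by sched_barrier(0), so the unpack pass and MFMAValuSpacingOpt only see the instructions to be interleaved:

// Illustrative OpenCL C kernel (assumed names); built for a CDNA target with
// MFMA support, e.g. clang -x cl -target amdgcn-amd-amdhsa -mcpu=gfx950 -nogpulib.
kernel void mfma_valu_iglp4_example(global float *p, global int *q) {
  int tid = __builtin_amdgcn_workitem_id_x();
  float a = p[tid];
  float b = p[tid + 1];
  float4 acc = (float4)(0.0f);
  int ia = __builtin_astype(a, int);
  int ib = __builtin_astype(b, int);
  __builtin_amdgcn_sched_barrier(0);   // keep the address math above out of the region
  int m0 = ia * ia;                    // independent VALU work to space between MFMAs
  int m1 = ia * ib;
  int m2 = ib * ib;
  __builtin_amdgcn_iglp_opt(4);        // MFMAValuSpacingOptID
  acc = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, acc, 0, 0, 0);
  acc = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, acc, 0, 0, 0);
  __builtin_amdgcn_sched_barrier(0);   // keep the stores below out of the region
  p[tid] = acc.x;
  q[tid] = m0;
  q[tid + 1] = m1;
  q[tid + 2] = m2;
}

With ValuGap=1 the pre-RA scheduler is then expected to order the fenced region as MFMA, MUL, MFMA, MUL, MUL, as checked in the .ll and .mir tests above.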
