https://github.com/hidekisaito updated https://github.com/llvm/llvm-project/pull/190723

>From 61f04b9c88eb92e00cc57f92dcbbeb4a292d4074 Mon Sep 17 00:00:00 2001
From: Hideki Saito <[email protected]>
Date: Mon, 6 Apr 2026 16:11:39 -0500
Subject: [PATCH] iglp_opt(4): Unpack V_PK F32 and then evenly space MFMAs and
 VALUs

---
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   |    6 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   10 +-
 llvm/lib/IR/Verifier.cpp                      |   10 +
 llvm/lib/Target/AMDGPU/AMDGPU.h               |    3 +
 llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp  | 1205 +++++++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     |   93 +-
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h       |   21 +
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |    1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   12 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |    1 +
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |   26 +
 .../iglp-opt4-mfma-valu-spacing-scheduler.ll  |   72 +
 .../iglp-opt4-mfma-valu-spacing-scheduler.mir |   35 +
 .../AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir    |   40 +
 .../CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll    |   59 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |    2 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |    4 +
 .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn |    1 +
 18 files changed, 1583 insertions(+), 18 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 0fc40d396a87d..cd4cb103291dd 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -980,14 +980,16 @@ void test_sched_group_barrier()
 // CHECK-LABEL: @test_iglp_opt
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 0)
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 1)
+// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 2)
+// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 3)
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 4)
-// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 15)
 void test_iglp_opt()
 {
   __builtin_amdgcn_iglp_opt(0);
   __builtin_amdgcn_iglp_opt(1);
+  __builtin_amdgcn_iglp_opt(2);
+  __builtin_amdgcn_iglp_opt(3);
   __builtin_amdgcn_iglp_opt(4);
-  __builtin_amdgcn_iglp_opt(15);
 }
 
 // CHECK-LABEL: @test_s_sleep
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f576972183eca..ebe48704bca4f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -377,8 +377,14 @@ def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_
   [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
    IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
-// Scheduler optimization hint.
-//     MASK = 0: Small gemm opt
+// Scheduler optimization hint: immediate selects AMDGPU::IGLPStrategyID (see
+// AMDGPUIGroupLP.h). Must be a constant in [0,4]. Mappings:
+//   0 - MFMA small-GEMM scheduling (MFMASmallGemmOpt).
+//   1 - MFMA small-GEMM single-wave variant (MFMASmallGemmSingleWaveOpt).
+//   2 - MFMAExpInterleaveOpt (TRANS/MFMA scheduling pipeline; see
+//       AMDGPUIGroupLP.cpp).
+//   3 - MFMAExpSimpleInterleaveOpt (simpler TRANS-then-MFMA interleave
+//       pattern).
+//   4 - MFMA (or WMMA) / VALU spacing (MFMAValuSpacingOpt); also gates
+//       AMDGPUIGLPUnpack V_PK unpack when that pass is enabled.
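+//
+// E.g. __builtin_amdgcn_iglp_opt(4) in a kernel requests the MFMA/VALU
+// spacing pipeline for the enclosing schedule region (see
+// clang/test/CodeGenOpenCL/builtins-amdgcn.cl).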
 def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
   Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                 IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 7d5dc91a1504e..6434298fe9062 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -7113,6 +7113,16 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
         "llvm.amdgcn.s.prefetch.data only supports global or constant memory");
     break;
   }
+  case Intrinsic::amdgcn_iglp_opt: {
+    const auto *Mask = dyn_cast<ConstantInt>(Call.getArgOperand(0));
+    Check(Mask, "llvm.amdgcn.iglp.opt requires a constant mask argument",
+          &Call, Call.getArgOperand(0));
+    const int64_t V = Mask->getSExtValue();
+    Check(V >= 0 && V <= 4,
+          "llvm.amdgcn.iglp.opt mask must be in the range [0,4]", &Call,
+          Call.getArgOperand(0));
+    break;
+  }
   case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
   case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
     Value *Src0 = Call.getArgOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 878f374110159..acbc19b43d32d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -559,6 +559,9 @@ extern char &GCNNSAReassignID;
 void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &);
 extern char &GCNPreRALongBranchRegID;
 
+void initializeAMDGPUIGLPUnpackLegacyPass(PassRegistry &);
+extern char &AMDGPUIGLPUnpackID;
+
 void initializeGCNPreRAOptimizationsLegacyPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp
new file mode 100644
index 0000000000000..7b370a3a7aaa9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupFixup.cpp
@@ -0,0 +1,1205 @@
+//===-- AMDGPUIGroupFixup.cpp - AMDGPU IGLP unpack MIR cleanup -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Machine IR cleanup pass that runs after rename-independent-subregs and
+// before the pre-RA machine scheduler.
+//
+// Schedule regions (for IGLP / packed-op analysis) are delimited by machine
+// basic block boundaries and by scheduling boundaries as defined by
+// SIInstrInfo::isSchedulingBoundary (e.g. SCHED_BARRIER with mask 0).
+//
+// V_PK_*_F32 unpacking lowers each packed op to two scalar VALUs that define
+// the destination register by subregs (e.g. vreg_64: undef %dst.sub0 = …;
+// %dst.sub1 = …), matching how wide accumulators are updated in real kernels
+// (full wide def, then per-lane subreg writes — see incoming FMHA MIR). A
+// single V_PK full-reg def leaves IsSSA set on the function; multiple subreg
+// defs require clearing IsSSA. Subreg indices follow the V_PK dest (sub0/sub1
+// for vreg_64; composite 64-bit lanes e.g. sub0_sub1 … sub6_sub7 on wide
+// vectors via composeSubRegIndices). Unpacked sources use the same compose
+// pattern as SIPreEmitPeephole (operand packed subreg composed with sub0/sub1
+// for the scalar lane). Undef on the first unpacked def only when the
+// destination vreg has no prior def in the block and the destination is not
+// also a packed source (otherwise lanes already hold defined values). Emit low
+// lane, then high lane (first BuildMI before V_PK is earliest in the block).
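+//
+// Illustrative shape of the rewrite (hypothetical vregs, V_PK operands
+// elided):
+//   %dst:vreg_64 = V_PK_ADD_F32 ..., %a:vreg_64, ..., %b:vreg_64, ...
+// becomes
+//   undef %dst.sub0:vreg_64 = V_ADD_F32_e64 0, %a.sub0, 0, %b.sub0, 0, 0
+//   %dst.sub1:vreg_64 = V_ADD_F32_e64 0, %a.sub1, 0, %b.sub1, 0, 0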
+//
+// COPY cleanup: when a packed source vreg is only used by this V_PK and is
+// populated solely by COPYs from another register (e.g. assembling a vreg_64
+// from two lanes of a wide vector), fold those COPY sources into the unpacked
+// VALUs and erase the now-dead COPY defs.
+//
+// Same-vreg subreg COPY (e.g. %v.sub1 = COPY %v.sub0): when every read of the
+// destination lane is an explicit DstSub use after the COPY, rewrite those to
+// SrcSub and remove the COPY. Full-register uses cannot be folded.
+//
+// Sink: when the COPY cannot be removed, move it down in the same MBB to
+// immediately before the first instruction that reads the destination lane
+// (or immediately before the first def that blocks sinking past it).
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIGroupLP.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <iterator>
+#include <limits>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-iglp-unpack"
+
+/// Max number of V_PK unpack attempts per schedule sub-region (in program
+/// order within the region). 0 means unlimited. Resets for each sub-region.
+/// Has no effect when --amdgpu-enable-iglp-unpack=false (full bypass).
+static cl::opt<unsigned> AMDGPUIGLPUnpackMaxVPKPerRegion(
+    "amdgpu-iglp-unpack-max-vpk-per-region",
+    cl::desc("Unpack at most this many V_PK instructions per schedule region "
+             "(0 = unlimited)"),
+    cl::init(0), cl::Hidden);
+
+/// After V_PK unpacking, optionally split 64-bit VGPR/AGPR vregs that are only
+/// referenced via \p sub0 / \p sub1 into 32-bit vregs. 0 skips this cleanup.
+/// Default is UINT_MAX (all eligible vregs). Unlike the V_PK-per-region cap,
+/// 0 does not mean unlimited.
+static cl::opt<unsigned> AMDGPUIGLPUnpackPostCleanupMaxVRegs(
+    "amdgpu-iglp-unpack-post-cleanup-max-vregs",
+    cl::desc("Post-unpack: split at most this many 64-bit VGPR/AGPR registers "
+             "to 32-bit (0 = skip; default = all)"),
+    cl::init(std::numeric_limits<unsigned>::max()), cl::Hidden);
+
+// === AMDGPUIGLP_UNPACK_POLICY ==============================================
+//
+// kRequireMFMAValuSpacingIGLPOptForRegion
+//   When true (default): only schedule sub-regions that contain IGLP_OPT with
+//   immediate == MFMAValuSpacingOptID are candidates for V_PK unpack. That
+//   pseudo must survive for the pre-RA scheduler (IGroupLP) to apply
+//   MFMAValuSpacingOpt.
+//   When false: any sub-region with at least one V_PK is a candidate; IGLP_OPT
+//   is not required.
+//
+static constexpr bool kRequireMFMAValuSpacingIGLPOptForRegion = true;
+// ============================================================================
+
+namespace {
+
+/// A schedule sub-region (between scheduling boundaries) that has at least one
+/// V_PK and, when kRequireMFMAValuSpacingIGLPOptForRegion is set, also an
+/// IGLP_OPT with immediate MFMAValuSpacingOptID, for unpack / cleanup.
+struct CandidateRegion {
+  MachineBasicBlock *MBB = nullptr;
+  MachineBasicBlock::iterator Begin;
+  MachineBasicBlock::iterator End;
+  SmallVector<MachineInstr *, 8> VPKInsts;
+};
+
+static bool isVPKOpcode(const MCInstrInfo &II, unsigned Opc) {
+  return II.getName(Opc).starts_with("V_PK");
+}
+
+/// Log why a single V_PK was not unpacked. Enable with
+/// -debug-only=amdgpu-iglp-unpack.
+static void debugSkipVPKUnpack(const MachineInstr &MI, const SIInstrInfo *TII,
+                               const TargetRegisterInfo *TRI,
+                               const char *Reason, unsigned LoSeqIdx = 0,
+                               unsigned HiSeqIdx = 0, bool LoRCValid = true,
+                               bool HiRCValid = true) {
+  LLVM_DEBUG({
+    dbgs() << DEBUG_TYPE << ": skip unpack (" << Reason << ")";
+    if (MI.getNumOperands() > 0 && MI.getOperand(0).isReg()) {
+      dbgs() << " op=" << TII->getName(MI.getOpcode()) << " dst="
+             << printReg(MI.getOperand(0).getReg(), TRI,
+                         MI.getOperand(0).getSubReg());
+    }
+    if (LoSeqIdx && TRI) {
+      if (const char *N = TRI->getSubRegIndexName(LoSeqIdx))
+        dbgs() << " loSub=" << N;
+      else
+        dbgs() << " loSubIdx=" << LoSeqIdx;
+      if (!LoRCValid)
+        dbgs() << "[!validRC]";
+    }
+    if (HiSeqIdx && TRI) {
+      if (const char *N = TRI->getSubRegIndexName(HiSeqIdx))
+        dbgs() << " hiSub=" << N;
+      else
+        dbgs() << " hiSubIdx=" << HiSeqIdx;
+      if (!HiRCValid)
+        dbgs() << "[!validRC]";
+    }
+    dbgs() << "\n";
+  });
+}
+
+static void splitMBBBySchedBarriers(
+    MachineBasicBlock &MBB, const SIInstrInfo &TII,
+    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
+                              MachineBasicBlock::iterator>> &Regions) {
+  Regions.clear();
+  MachineFunction &MF = *MBB.getParent();
+  MachineBasicBlock::iterator Start = MBB.begin();
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
+       ++I) {
+    if (TII.isSchedulingBoundary(*I, &MBB, MF)) {
+      Regions.push_back({Start, I});
+      Start = std::next(I);
+    }
+  }
+  Regions.push_back({Start, MBB.end()});
+}
+
+static bool findCandidateRegion(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator RegionBegin,
+                                MachineBasicBlock::iterator RegionEnd,
+                                const MCInstrInfo &II, CandidateRegion &Out) {
+  Out = CandidateRegion{};
+  bool HasMFMAValuSpacingIGLP = false;
+
+  for (MachineBasicBlock::iterator It = RegionBegin; It != RegionEnd; ++It) {
+    MachineInstr &MI = *It;
+    if (MI.getOpcode() == AMDGPU::IGLP_OPT && MI.getNumOperands() >= 1 &&
+        MI.getOperand(0).isImm() &&
+        MI.getOperand(0).getImm() ==
+            static_cast<int64_t>(AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID))
+      HasMFMAValuSpacingIGLP = true;
+
+    if (isVPKOpcode(II, MI.getOpcode()))
+      Out.VPKInsts.push_back(&MI);
+  }
+
+  if (Out.VPKInsts.empty())
+    return false;
+  if (kRequireMFMAValuSpacingIGLPOptForRegion && !HasMFMAValuSpacingIGLP)
+    return false;
+
+  Out.MBB = &MBB;
+  Out.Begin = RegionBegin;
+  Out.End = RegionEnd;
+  return true;
+}
+
+// --- F32 unpack (aligned with SIPreEmitPeephole) ----------------------------
+
+/// Skip unpacking for a schedule sub-region if any instruction has an explicit
+/// operand using an allocatable physical register (pre-RA MIR is virtual; phys
+/// operands indicate an unusual/late state we do not transform).
+static bool
+schedRegionHasExplicitAllocatablePhysReg(MachineBasicBlock::iterator Begin,
+                                         MachineBasicBlock::iterator End,
+                                         const MachineRegisterInfo &MRI) {
+  for (MachineBasicBlock::iterator It = Begin; It != End; ++It) {
+    for (const MachineOperand &MO : It->operands()) {
+      if (!MO.isReg() || !MO.getReg().isPhysical())
+        continue;
+      if (MO.isImplicit())
+        continue;
+      if (!MRI.isAllocatable(MO.getReg()))
+        continue;
+      return true;
+    }
+  }
+  return false;
+}
+
+static uint32_t mapToUnpackedOpcode(const MachineInstr &I) {
+  switch (I.getOpcode()) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_ADD_F32_gfx12:
+    return AMDGPU::V_ADD_F32_e64;
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_MUL_F32_gfx12:
+    return AMDGPU::V_MUL_F32_e64;
+  case AMDGPU::V_PK_FMA_F32:
+  case AMDGPU::V_PK_FMA_F32_gfx12:
+    return AMDGPU::V_FMA_F32_e64;
+  default:
+    return std::numeric_limits<uint32_t>::max();
+  }
+}
+
+static bool canUnpackingClobberRegister(const MachineInstr &MI,
+                                        const SIInstrInfo *TII,
+                                        const SIRegisterInfo *TRI) {
+  Register DstReg = MI.getOperand(0).getReg();
+  // Virtual unpack lowers to subreg VALU defs; the classic post-RA clobber
+  // case only applies to physical destinations.
+  if (DstReg.isVirtual())
+    return false;
+
+  unsigned OpCode = MI.getOpcode();
+  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
+
+  const MachineOperand *Src0MO =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  if (Src0MO && Src0MO->isReg()) {
+    Register SrcReg0 = Src0MO->getReg();
+    unsigned Src0Mods =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
+    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
+                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
+                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
+    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
+      return true;
+  }
+
+  const MachineOperand *Src1MO =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1MO && Src1MO->isReg()) {
+    Register SrcReg1 = Src1MO->getReg();
+    unsigned Src1Mods =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
+    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
+                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
+                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
+    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
+      return true;
+  }
+
+  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
+    const MachineOperand *Src2MO =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    if (Src2MO && Src2MO->isReg()) {
+      Register SrcReg2 = Src2MO->getReg();
+      unsigned Src2Mods =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
+      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
+                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
+                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
+      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+/// True if any packed source uses the same vreg as the destination
+/// (dst-as-src).
+static bool vpkAnySrcUsesDst(const MachineInstr &MI, const SIInstrInfo *TII,
+                             Register DstReg) {
+  unsigned Opc = MI.getOpcode();
+  const MachineOperand *S0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  if (S0 && S0->isReg() && S0->getReg() == DstReg)
+    return true;
+  const MachineOperand *S1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (S1 && S1->isReg() && S1->getReg() == DstReg)
+    return true;
+  if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src2)) {
+    const MachineOperand *S2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    if (S2 && S2->isReg() && S2->getReg() == DstReg)
+      return true;
+  }
+  return false;
+}
+
+/// True if Reg has an explicit def in the same MBB strictly before I.
+static bool hasExplicitDefOfRegBefore(const MachineInstr &I, Register Reg) {
+  const MachineBasicBlock *MBB = I.getParent();
+  for (MachineBasicBlock::const_iterator It = MBB->begin(), E = 
I.getIterator();
+       It != E; ++It) {
+    for (const MachineOperand &MO : It->operands()) {
+      if (MO.isReg() && MO.isDef() && MO.getReg() == Reg)
+        return true;
+    }
+  }
+  return false;
+}
+
+/// True if every non-debug use of R is on \p VPK (same instruction may use R
+/// more than once, e.g. src0 and src1).
+static bool isVirtualSrcRegOnlyUsedByThisVPK(Register R, MachineInstr &VPK,
+                                             MachineRegisterInfo &MRI) {
+  if (!R.isVirtual())
+    return false;
+  if (MRI.use_nodbg_empty(R))
+    return false;
+  for (MachineInstr &U : MRI.use_nodbg_instructions(R)) {
+    if (&U != &VPK)
+      return false;
+  }
+  return true;
+}
+
+/// If a COPY in the same MBB strictly before VPK defines SrcReg with subreg
+/// NeedSubIdx from a register source, return true and set OutReg/OutSub.
+static bool tryFoldThroughSameBlockCopyDef(const MachineInstr &VPK,
+                                           Register SrcReg,
+                                           unsigned NeedSubIdx,
+                                           Register &OutReg,
+                                           unsigned &OutSub) {
+  const MachineBasicBlock *MBB = VPK.getParent();
+  for (MachineBasicBlock::const_iterator It = MBB->begin(),
+                                         E = VPK.getIterator();
+       It != E; ++It) {
+    if (It->getOpcode() != AMDGPU::COPY)
+      continue;
+    const MachineOperand &DefMO = It->getOperand(0);
+    if (!DefMO.isReg() || !DefMO.isDef() || DefMO.getReg() != SrcReg)
+      continue;
+    if (DefMO.getSubReg() != NeedSubIdx)
+      continue;
+    const MachineOperand &SrcMO = It->getOperand(1);
+    if (!SrcMO.isReg())
+      return false;
+    OutReg = SrcMO.getReg();
+    OutSub = SrcMO.getSubReg();
+    return true;
+  }
+  return false;
+}
+
+static void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
+                              bool IsHiBits, const MachineOperand &SrcMO,
+                              Register DstReg, Register LaneSrcBase,
+                              unsigned DstPackSub, MachineInstr &VPK,
+                              const SIRegisterInfo *TRI) {
+  unsigned NewSrcMods = 0;
+  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+  unsigned OpSelModifier =
+      IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+  if (SrcMods & NegModifier)
+    NewSrcMods |= SISrcMods::NEG;
+  NewMI.addImm(NewSrcMods);
+  if (SrcMO.isImm()) {
+    NewMI.addImm(SrcMO.getImm());
+    return;
+  }
+  Register OrigSrcReg = SrcMO.getReg();
+  unsigned SrcPackSub = SrcMO.getSubReg();
+
+  const bool UseHiOfPair = (SrcMods & OpSelModifier) != 0;
+  const unsigned PairLane = UseHiOfPair ? AMDGPU::sub1 : AMDGPU::sub0;
+
+  // Packed subreg on the operand (e.g. sub6_sub7); when src is the same
+  // super-register as dst (dst-as-src), MO may omit the subreg — use the V_PK
+  // destination packed subreg.
+  unsigned BasePack = SrcPackSub;
+  if (!BasePack && OrigSrcReg == DstReg)
+    BasePack = DstPackSub;
+
+  unsigned FinalSubIdx =
+      BasePack ? TRI->composeSubRegIndices(BasePack, PairLane) : PairLane;
+
+  Register SrcReg = OrigSrcReg;
+  if (OrigSrcReg.isVirtual() && OrigSrcReg != DstReg) {
+    Register FoldReg;
+    unsigned FoldSub = 0;
+    if (tryFoldThroughSameBlockCopyDef(VPK, OrigSrcReg, FinalSubIdx, FoldReg,
+                                       FoldSub) &&
+        FoldSub) {
+      // COPY source subreg is the exact read for this packed lane; use it
+      // directly for scalar lanes (<=32b). Wider composed subregs need
+      // compose(FoldSub, PairLane).
+      SrcReg = FoldReg;
+      const unsigned Sz = TRI->getSubRegIdxSize(FoldSub);
+      if (Sz <= 32)
+        FinalSubIdx = FoldSub;
+      else if (unsigned Composed =
+                   TRI->composeSubRegIndices(FoldSub, PairLane))
+        FinalSubIdx = Composed;
+      else
+        FinalSubIdx = FoldSub;
+    }
+  }
+
+  if (OrigSrcReg == DstReg)
+    SrcReg = LaneSrcBase;
+
+  bool KillState = false;
+  if (SrcMO.isKill()) {
+    bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
+    bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
+    KillState = true;
+    if ((OpSel == OpSelHi) && !IsHiBits)
+      KillState = false;
+  }
+  if (SrcReg.isPhysical()) {
+    Register Phys = TRI->getSubReg(SrcReg, FinalSubIdx);
+    if (KillState)
+      NewMI.addReg(Phys, RegState::Kill);
+    else
+      NewMI.addReg(Phys);
+  } else {
+    if (KillState)
+      NewMI.addReg(SrcReg, RegState::Kill, FinalSubIdx);
+    else
+      NewMI.addReg(SrcReg, {}, FinalSubIdx);
+  }
+}
+
+static MachineInstrBuilder
+createUnpackedMI(MachineInstr &I, const SIInstrInfo *TII,
+                 const SIRegisterInfo *TRI, uint32_t UnpackedOpcode,
+                 bool IsHiBits, Register DstReg, unsigned DefSubIdx,
+                 Register LaneSrcBase, unsigned DstPackSub, bool UndefOnDef) {
+  MachineBasicBlock &MBB = *I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+  unsigned OpCode = I.getOpcode();
+
+  int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
+  unsigned Src0Mods =
+      TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
+  unsigned Src1Mods =
+      TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
+
+  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+  NewMI.addDef(DstReg, RegState::Define, DefSubIdx);
+  if (UndefOnDef)
+    NewMI->getOperand(0).setIsUndef(true);
+  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0, DstReg, LaneSrcBase,
+                    DstPackSub, I, TRI);
+  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1, DstReg, LaneSrcBase,
+                    DstPackSub, I, TRI);
+
+  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
+    const MachineOperand *SrcMO2 =
+        TII->getNamedOperand(I, AMDGPU::OpName::src2);
+    unsigned Src2Mods =
+        TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
+    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2, DstReg, LaneSrcBase,
+                      DstPackSub, I, TRI);
+  }
+  NewMI.addImm(ClampVal);
+  NewMI.addImm(0);
+  return NewMI;
+}
+
+static void
+recomputeIntervalsAfterVirtualUnpack(ArrayRef<MachineInstr *> MIs,
+                                     LiveIntervals &LIS,
+                                     ArrayRef<Register> ExtraRegs = {}) {
+  SmallVector<Register, 16> Regs;
+  for (MachineInstr *MI : MIs) {
+    if (!MI)
+      continue;
+    for (MachineOperand &MO : MI->operands()) {
+      if (MO.isReg() && MO.getReg().isVirtual())
+        Regs.push_back(MO.getReg());
+    }
+  }
+  Regs.append(ExtraRegs.begin(), ExtraRegs.end());
+  llvm::sort(Regs);
+  Regs.erase(llvm::unique(Regs), Regs.end());
+  for (Register R : Regs) {
+    if (LIS.hasInterval(R))
+      LIS.removeInterval(R);
+    LIS.createAndComputeVirtRegInterval(R);
+  }
+}
+
+/// Erase COPY defs of \p R in \p MBB when \p R has no non-debug uses (e.g. a
+/// temp that only fed this V_PK). Collect touched virtual regs for LIS
+/// recomputation.
+static void eraseRedundantCopyDefsForRegIfUnused(
+    Register R, MachineBasicBlock &MBB, MachineRegisterInfo &MRI,
+    LiveIntervals &LIS, SmallVectorImpl<Register> &RegsToRecompute) {
+  if (!R.isVirtual() || !MRI.use_nodbg_empty(R))
+    return;
+
+  SmallVector<MachineInstr *, 8> ToErase;
+  for (MachineInstr &MI : MBB) {
+    if (MI.getOpcode() != AMDGPU::COPY)
+      continue;
+    MachineOperand &Def = MI.getOperand(0);
+    if (!Def.isReg() || !Def.isDef() || Def.getReg() != R)
+      continue;
+    ToErase.push_back(&MI);
+  }
+  for (MachineInstr *MI : ToErase) {
+    for (MachineOperand &MO : MI->operands()) {
+      if (MO.isReg() && MO.getReg().isVirtual())
+        RegsToRecompute.push_back(MO.getReg());
+    }
+    LIS.RemoveMachineInstrFromMaps(*MI);
+    MI->eraseFromParent();
+  }
+}
+
+/// True iff \p A is ordered before \p B in the same block (linear scan).
+static bool instrIsBeforeInSameBB(const MachineInstr *A,
+                                  const MachineInstr *B) {
+  assert(A->getParent() == B->getParent());
+  for (const MachineInstr &MI : *A->getParent()) {
+    if (&MI == A)
+      return true;
+    if (&MI == B)
+      return false;
+  }
+  llvm_unreachable("instructions not in same block");
+}
+
+static LaneBitmask laneMaskForRegOperand(const MachineOperand &MO, Register R,
+                                         const MachineRegisterInfo &MRI,
+                                         const SIRegisterInfo &TRI) {
+  if (!MO.isReg() || MO.getReg() != R)
+    return LaneBitmask::getNone();
+  const TargetRegisterClass *RC = MRI.getRegClass(R);
+  unsigned Sub = MO.getSubReg();
+  if (Sub)
+    return TRI.getSubRegIndexLaneMask(Sub);
+  // No subreg: operand names the full virtual register. RC->getLaneMask() can
+  // equal a single 32-bit lane for 64-bit vreg_64 classes; OR in the paired
+  // lane so full-reg reads overlap sub1 when querying sub1's mask.
+  LaneBitmask M = RC->getLaneMask();
+  if (TRI.getRegSizeInBits(*RC) == 64) {
+    LaneBitmask S0 = TRI.getSubRegIndexLaneMask(AMDGPU::sub0);
+    LaneBitmask S1 = TRI.getSubRegIndexLaneMask(AMDGPU::sub1);
+    if (M == S0 || M == S1)
+      return S0 | S1;
+  }
+  return M;
+}
+
+/// True if \p Blocker defines \p R on a lane that overlaps the COPY source or
+/// (non-COPY) destination lane, so the same-vreg COPY cannot be sunk past it.
+static bool instrBlocksSinkOfSameVRegCopy(const MachineInstr &Blocker,
+                                          const MachineInstr &CopyMI,
+                                          Register R, LaneBitmask SMask,
+                                          LaneBitmask DMask,
+                                          const MachineRegisterInfo &MRI,
+                                          const SIRegisterInfo &TRI) {
+  for (const MachineOperand &MO : Blocker.all_defs()) {
+    if (!MO.isReg() || MO.getReg() != R)
+      continue;
+    unsigned Sub = MO.getSubReg();
+    const TargetRegisterClass *RC = MRI.getRegClass(R);
+    LaneBitmask FullMask = RC->getLaneMask();
+    LaneBitmask DefM = Sub ? TRI.getSubRegIndexLaneMask(Sub) : FullMask;
+    if ((DefM & SMask).any())
+      return true;
+    if ((DefM & DMask).any() && &Blocker != &CopyMI)
+      return true;
+  }
+  return false;
+}
+
+/// First instruction in the same MBB strictly after \p CopyMI in program
+/// order, before \p SearchEnd (exclusive), that has an operand on \p R
+/// touching a lane overlapping \p DMask.
+static MachineInstr *findFirstDependentUseAfterSameVRegCopy(
+    MachineInstr &CopyMI, Register R, LaneBitmask DMask,
+    const MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
+    MachineBasicBlock::iterator SearchEnd) {
+  for (auto It = std::next(CopyMI.getIterator()); It != SearchEnd; ++It) {
+    MachineInstr &MI = *It;
+    for (MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg() || MO.getReg() != R || MO.isDebug())
+        continue;
+      // Ignore operands that do not read the register (e.g. S_NOP may carry
+      // reg operands that are not real uses for liveness).
+      if (!MO.readsReg())
+        continue;
+      LaneBitmask M = laneMaskForRegOperand(MO, R, MRI, TRI);
+      if ((M & DMask).none())
+        continue;
+      return &MI;
+    }
+  }
+  return nullptr;
+}
+
+/// Sink COPY %R:DstSub = %R:SrcSub toward its first dependent use, without
+/// crossing \p RegionEnd (the exclusive end of the current schedule region,
+/// e.g. the next scheduling boundary). If the first dependent use is in a
+/// later region, sink to immediately before \p RegionEnd instead.
+static bool trySinkSameVRegSubregCopy(MachineInstr &CopyMI,
+                                      MachineRegisterInfo &MRI,
+                                      LiveIntervals &LIS,
+                                      const SIRegisterInfo &TRI,
+                                      MachineBasicBlock::iterator RegionEnd) {
+  if (CopyMI.getOpcode() != AMDGPU::COPY || CopyMI.getNumOperands() < 2)
+    return false;
+  if (CopyMI.isBundled())
+    return false;
+
+  MachineOperand &DefMO = CopyMI.getOperand(0);
+  MachineOperand &SrcMO = CopyMI.getOperand(1);
+  if (!DefMO.isReg() || !DefMO.isDef() || !SrcMO.isReg())
+    return false;
+  if (!SrcMO.readsReg())
+    return false;
+  Register R = DefMO.getReg();
+  if (R != SrcMO.getReg() || !R.isVirtual())
+    return false;
+  unsigned DSub = DefMO.getSubReg();
+  unsigned SSub = SrcMO.getSubReg();
+  if (!DSub || !SSub || DSub == SSub)
+    return false;
+
+  LaneBitmask DMask = TRI.getSubRegIndexLaneMask(DSub);
+  LaneBitmask SMask = TRI.getSubRegIndexLaneMask(SSub);
+
+  MachineBasicBlock *MBB = CopyMI.getParent();
+  MachineBasicBlock::iterator CopyIt = CopyMI.getIterator();
+
+  if (std::next(CopyIt) == RegionEnd)
+    return false;
+
+  MachineInstr *FirstUseInRegion = findFirstDependentUseAfterSameVRegCopy(
+      CopyMI, R, DMask, MRI, TRI, RegionEnd);
+
+  MachineBasicBlock::iterator InsertPt;
+
+  if (FirstUseInRegion) {
+    MachineBasicBlock::iterator FirstUseIt = FirstUseInRegion->getIterator();
+    InsertPt = FirstUseIt;
+    for (MachineBasicBlock::iterator It = std::next(CopyIt); It != FirstUseIt;
+         ++It) {
+      if (instrBlocksSinkOfSameVRegCopy(*It, CopyMI, R, SMask, DMask, MRI,
+                                        TRI)) {
+        InsertPt = It;
+        break;
+      }
+    }
+  } else {
+    // Dependent use is only past this schedule region — sink to the region
+    // end (before RegionEnd), not to the first use in the next region.
+    if (RegionEnd == MBB->end())
+      return false;
+    MachineInstr *LaterUse = findFirstDependentUseAfterSameVRegCopy(
+        CopyMI, R, DMask, MRI, TRI, MBB->end());
+    if (!LaterUse)
+      return false;
+    InsertPt = RegionEnd;
+    for (MachineBasicBlock::iterator It = std::next(CopyIt); It != RegionEnd;
+         ++It) {
+      if (instrBlocksSinkOfSameVRegCopy(*It, CopyMI, R, SMask, DMask, MRI,
+                                        TRI)) {
+        InsertPt = It;
+        break;
+      }
+    }
+  }
+
+  if (InsertPt == std::next(CopyIt))
+    return false;
+
+  CopyMI.moveBefore(&*InsertPt);
+  LIS.handleMove(CopyMI);
+  return true;
+}
+
+/// COPY DstReg:DstSub = SrcReg:SrcSub with DstReg==SrcReg (virtual): redirect
+/// explicit uses of DstSub to SrcSub and remove the COPY.
+///
+/// Does not remove the COPY when any use reads the destination lane without an
+/// explicit DstSub (e.g. full-register %v:vreg_64) — those operands cannot be
+/// rewritten to SrcSub without changing semantics (sub1 would become undef).
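+///
+/// E.g. (illustrative, hypothetical vreg %v): after "%v.sub1 = COPY %v.sub0",
+/// a later read "V_MUL_F32_e64 0, %v.sub1, ..." is rewritten to read %v.sub0
+/// and the COPY is erased.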
+static bool tryFoldSameVRegSubregCopy(MachineInstr &CopyMI,
+                                      MachineRegisterInfo &MRI,
+                                      LiveIntervals &LIS,
+                                      const SIRegisterInfo &TRI) {
+  if (CopyMI.getOpcode() != AMDGPU::COPY || CopyMI.getNumOperands() < 2)
+    return false;
+  MachineOperand &DefMO = CopyMI.getOperand(0);
+  MachineOperand &SrcMO = CopyMI.getOperand(1);
+  if (!DefMO.isReg() || !DefMO.isDef() || !SrcMO.isReg())
+    return false;
+  if (!SrcMO.readsReg())
+    return false;
+  Register R = DefMO.getReg();
+  if (R != SrcMO.getReg() || !R.isVirtual())
+    return false;
+  unsigned DSub = DefMO.getSubReg();
+  unsigned SSub = SrcMO.getSubReg();
+  if (!DSub || !SSub || DSub == SSub)
+    return false;
+
+  LaneBitmask DMask = TRI.getSubRegIndexLaneMask(DSub);
+
+  SmallVector<MachineOperand *, 16> UseOps;
+  for (MachineOperand &MO : MRI.use_nodbg_operands(R)) {
+    if (!MO.readsReg())
+      continue;
+    if (MO.getParent() == &CopyMI)
+      continue;
+
+    LaneBitmask M = laneMaskForRegOperand(MO, R, MRI, TRI);
+    if ((M & DMask).none())
+      continue;
+
+    unsigned Sub = MO.getSubReg();
+
+    if (Sub != DSub)
+      return false;
+
+    MachineInstr *UseMI = MO.getParent();
+    if (UseMI->getParent() != CopyMI.getParent())
+      return false;
+    if (!instrIsBeforeInSameBB(&CopyMI, UseMI))
+      return false;
+
+    UseOps.push_back(&MO);
+  }
+
+  for (MachineOperand *MO : UseOps)
+    MO->setSubReg(SSub);
+
+  LIS.RemoveMachineInstrFromMaps(CopyMI);
+  CopyMI.eraseFromParent();
+
+  if (LIS.hasInterval(R)) {
+    LIS.removeInterval(R);
+    LIS.createAndComputeVirtRegInterval(R);
+  }
+  return true;
+}
+
+/// Walk [Begin, End) and fold / sink same-vreg subreg COPYs (iterator-safe).
+static bool cleanupSameVRegSubregCopiesInRange(
+    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+    MachineRegisterInfo &MRI, LiveIntervals &LIS, const SIRegisterInfo &TRI) {
+  bool Changed = false;
+  SmallVector<MachineInstr *, 16> Copies;
+  for (auto I = Begin; I != End; ++I) {
+    if (I->getOpcode() == AMDGPU::COPY)
+      Copies.push_back(&*I);
+  }
+  for (MachineInstr *MI : Copies) {
+    if (tryFoldSameVRegSubregCopy(*MI, MRI, LIS, TRI))
+      Changed = true;
+  }
+  Copies.clear();
+  for (auto I = Begin; I != End; ++I) {
+    if (I->getOpcode() == AMDGPU::COPY)
+      Copies.push_back(&*I);
+  }
+  for (MachineInstr *MI : Copies) {
+    if (trySinkSameVRegSubregCopy(*MI, MRI, LIS, TRI, End))
+      Changed = true;
+  }
+  return Changed;
+}
+
+static bool performF32Unpacking(MachineInstr &I, const SIInstrInfo *TII,
+                                const SIRegisterInfo *TRI, LiveIntervals &LIS,
+                                DenseSet<Register> *InvolvedUnpackRegs) {
+  uint32_t UnpackedOpcode = mapToUnpackedOpcode(I);
+  if (UnpackedOpcode == std::numeric_limits<uint32_t>::max()) {
+    debugSkipVPKUnpack(I, TII, TRI, "unsupported_opcode");
+    return false;
+  }
+  if (canUnpackingClobberRegister(I, TII, TRI)) {
+    debugSkipVPKUnpack(I, TII, TRI, "would_clobber_overlapping_src");
+    return false;
+  }
+
+  const MachineOperand &DstMO = I.getOperand(0);
+  Register DstReg = DstMO.getReg();
+  if (!DstReg.isVirtual()) {
+    debugSkipVPKUnpack(I, TII, TRI, "physical_dst");
+    return false;
+  }
+
+  unsigned DstSub = DstMO.getSubReg();
+
+  MachineFunction *MF = I.getMF();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+
+  // Lane indices for subreg defs: full vreg_64 uses sub0/sub1. For composite
+  // 64-bit packed destinations (sub0_sub1, sub2_sub3, sub4_sub5, sub6_sub7,
+  // … on wide vectors), decompose with composeSubRegIndices — not getSubReg,
+  // whose first operand is an MCRegister, not a SubRegIndex (SIInstrInfo uses
+  // the same compose pattern for partial subregs).
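+  // E.g. composeSubRegIndices(sub6_sub7, sub0) yields sub6 and
+  // composeSubRegIndices(sub6_sub7, sub1) yields sub7.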
+  unsigned LoSeqIdx = DstSub ? TRI->composeSubRegIndices(DstSub, AMDGPU::sub0)
+                             : static_cast<unsigned>(AMDGPU::sub0);
+  unsigned HiSeqIdx = DstSub ? TRI->composeSubRegIndices(DstSub, AMDGPU::sub1)
+                             : static_cast<unsigned>(AMDGPU::sub1);
+  if (DstSub && (!LoSeqIdx || !HiSeqIdx)) {
+    debugSkipVPKUnpack(I, TII, TRI, "bad_composite_subreg_decompose", LoSeqIdx,
+                       HiSeqIdx);
+    return false;
+  }
+
+  const bool LoRCValid = TRI->isSubRegValidForRegClass(DstRC, LoSeqIdx);
+  const bool HiRCValid = TRI->isSubRegValidForRegClass(DstRC, HiSeqIdx);
+  if (!LoRCValid || !HiRCValid) {
+    debugSkipVPKUnpack(I, TII, TRI, "invalid_subreg_for_regclass", LoSeqIdx,
+                       HiSeqIdx, LoRCValid, HiRCValid);
+    return false;
+  }
+
+  Register LaneSrcBase = DstReg;
+
+  const bool UndefOnFirstLane = !vpkAnySrcUsesDst(I, TII, DstReg) &&
+                                !hasExplicitDefOfRegBefore(I, DstReg);
+
+  SmallVector<Register, 4> CleanCandidates;
+  auto AddSrcCandidate = [&](const MachineOperand *MO) {
+    if (!MO || !MO->isReg())
+      return;
+    Register R = MO->getReg();
+    if (!R.isVirtual() || R == DstReg)
+      return;
+    if (!isVirtualSrcRegOnlyUsedByThisVPK(R, I, MRI))
+      return;
+    CleanCandidates.push_back(R);
+  };
+  AddSrcCandidate(TII->getNamedOperand(I, AMDGPU::OpName::src0));
+  AddSrcCandidate(TII->getNamedOperand(I, AMDGPU::OpName::src1));
+  if (AMDGPU::hasNamedOperand(I.getOpcode(), AMDGPU::OpName::src2))
+    AddSrcCandidate(TII->getNamedOperand(I, AMDGPU::OpName::src2));
+  llvm::sort(CleanCandidates);
+  CleanCandidates.erase(llvm::unique(CleanCandidates), CleanCandidates.end());
+
+  SmallVector<Register, 4> SrcRegsForInvolvedSet;
+  auto AddInvolvedSrc = [&](const MachineOperand *MO) {
+    if (!MO || !MO->isReg())
+      return;
+    Register R = MO->getReg();
+    if (R.isVirtual())
+      SrcRegsForInvolvedSet.push_back(R);
+  };
+  AddInvolvedSrc(TII->getNamedOperand(I, AMDGPU::OpName::src0));
+  AddInvolvedSrc(TII->getNamedOperand(I, AMDGPU::OpName::src1));
+  if (AMDGPU::hasNamedOperand(I.getOpcode(), AMDGPU::OpName::src2))
+    AddInvolvedSrc(TII->getNamedOperand(I, AMDGPU::OpName::src2));
+
+  MachineBasicBlock *MBB = I.getParent();
+
+  LIS.RemoveMachineInstrFromMaps(I);
+
+  // BuildMI(MBB, I, ...) inserts each new instruction immediately before I,
+  // so the first BuildMI call ends up furthest from I in program order: low
+  // lane, then high lane.
+  MachineInstrBuilder Op0L = createUnpackedMI(I, TII, TRI, UnpackedOpcode,
+                                              /*IsHiBits=*/false, DstReg,
+                                              LoSeqIdx, LaneSrcBase, DstSub,
+                                              /*UndefOnDef=*/UndefOnFirstLane);
+  LIS.InsertMachineInstrInMaps(*Op0L);
+  MachineInstrBuilder Op0H =
+      createUnpackedMI(I, TII, TRI, UnpackedOpcode,
+                       /*IsHiBits=*/true, DstReg, HiSeqIdx, LaneSrcBase,
+                       DstSub,
+                       /*UndefOnDef=*/false);
+  LIS.InsertMachineInstrInMaps(*Op0H);
+
+  uint32_t IFlags = I.getFlags();
+  Op0L->setFlags(IFlags);
+  Op0H->setFlags(IFlags);
+
+  I.eraseFromParent();
+
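+  // The unpack introduced two subreg defs of DstReg, so the function is no
+  // longer in SSA form (see the file header comment).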
+  MF->getProperties().reset(MachineFunctionProperties::Property::IsSSA);
+
+  SmallVector<Register, 16> ExtraRegs;
+  for (Register R : CleanCandidates)
+    eraseRedundantCopyDefsForRegIfUnused(R, *MBB, MRI, LIS, ExtraRegs);
+
+  SmallVector<MachineInstr *, 5> ToRecompute;
+  ToRecompute.push_back(&*Op0L);
+  ToRecompute.push_back(&*Op0H);
+  recomputeIntervalsAfterVirtualUnpack(ToRecompute, LIS, ExtraRegs);
+  if (InvolvedUnpackRegs) {
+    InvolvedUnpackRegs->insert(DstReg);
+    for (Register R : SrcRegsForInvolvedSet)
+      InvolvedUnpackRegs->insert(R);
+  }
+  return true;
+}
+
+/// True if \p R is a 64-bit VGPR/AGPR virtual register and every register
+/// operand uses only \p sub0 or \p sub1 (no full-register or other subregs).
+static bool is64BitOnlySub0Sub1(Register R, const MachineRegisterInfo &MRI,
+                                const SIRegisterInfo &TRI) {
+  if (!R.isVirtual())
+    return false;
+  const TargetRegisterClass *RC = MRI.getRegClass(R);
+  if (!RC || TRI.getRegSizeInBits(*RC) != 64)
+    return false;
+  if (!TRI.isVGPR(MRI, R) && !TRI.isAGPR(MRI, R))
+    return false;
+
+  bool Any = false;
+  for (const MachineOperand &MO : MRI.reg_operands(R)) {
+    Any = true;
+    unsigned S = MO.getSubReg();
+    if (S == AMDGPU::NoSubRegister)
+      return false;
+    if (S != AMDGPU::sub0 && S != AMDGPU::sub1)
+      return false;
+  }
+  return Any;
+}
+
+static bool split64BitOnlySub0Sub1ToV32(Register R, MachineRegisterInfo &MRI,
+                                        const SIRegisterInfo &TRI,
+                                        LiveIntervals &LIS) {
+  const TargetRegisterClass *RC = MRI.getRegClass(R);
+  const TargetRegisterClass *LoRC = TRI.getSubRegisterClass(RC, AMDGPU::sub0);
+  const TargetRegisterClass *HiRC = TRI.getSubRegisterClass(RC, AMDGPU::sub1);
+  if (!LoRC || !HiRC)
+    return false;
+
+  bool HasLo = false, HasHi = false;
+  for (const MachineOperand &MO : MRI.reg_operands(R)) {
+    unsigned S = MO.getSubReg();
+    if (S == AMDGPU::sub0)
+      HasLo = true;
+    else if (S == AMDGPU::sub1)
+      HasHi = true;
+  }
+  if (!HasLo && !HasHi)
+    return false;
+
+  Register NewLo, NewHi;
+  if (HasLo)
+    NewLo = MRI.createVirtualRegister(LoRC);
+  if (HasHi)
+    NewHi = MRI.createVirtualRegister(HiRC);
+
+  SmallVector<MachineOperand *, 32> Ops;
+  for (MachineOperand &MO : MRI.reg_operands(R))
+    Ops.push_back(&MO);
+
+  for (MachineOperand *MO : Ops) {
+    unsigned S = MO->getSubReg();
+    Register NR = (S == AMDGPU::sub0) ? NewLo : NewHi;
+    MO->setReg(NR);
+    MO->setSubReg(AMDGPU::NoSubRegister);
+    // Partial subreg defs used UndefOnDef for the wide vreg; each new vreg_32
+    // is fully written by its instruction — drop stale undef on defs.
+    if (MO->isDef())
+      MO->setIsUndef(false);
+  }
+
+  if (LIS.hasInterval(R))
+    LIS.removeInterval(R);
+  if (HasLo)
+    LIS.createAndComputeVirtRegInterval(NewLo);
+  if (HasHi)
+    LIS.createAndComputeVirtRegInterval(NewHi);
+
+  LLVM_DEBUG({
+    dbgs() << DEBUG_TYPE << ": post-unpack split " << printReg(R, &TRI)
+           << " -> ";
+    if (HasLo)
+      dbgs() << printReg(NewLo, &TRI) << '(' << TRI.getRegClassName(LoRC)
+             << ')';
+    if (HasLo && HasHi)
+      dbgs() << ", ";
+    if (HasHi)
+      dbgs() << printReg(NewHi, &TRI) << '(' << TRI.getRegClassName(HiRC)
+             << ')';
+    dbgs() << '\n';
+  });
+
+  return true;
+}
+
+/// Split 64-bit VGPR/AGPR vregs that only use \p sub0 / \p sub1 into 32-bit
+/// vregs. Only considers virtual registers that appeared as a V_PK
+/// destination or as a virtual src0/src1/src2 on a successful unpack in this
+/// pass (\p InvolvedUnpackRegs). Only the first \p MaxRegsToProcess candidates
+/// in virtual-register index order are considered (UINT_MAX = all).
+static bool postUnpackSplit64BitSubregsToV32(
+    MachineFunction &MF, LiveIntervals &LIS, const SIRegisterInfo &TRI,
+    unsigned MaxRegsToProcess, const DenseSet<Register> &InvolvedUnpackRegs) {
+  assert(MaxRegsToProcess > 0);
+  if (InvolvedUnpackRegs.empty())
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const unsigned NumVR = MRI.getNumVirtRegs();
+  SmallVector<Register, 64> Candidates;
+  Candidates.reserve(InvolvedUnpackRegs.size());
+  for (unsigned I = 0; I < NumVR; ++I) {
+    Register R = Register::index2VirtReg(I);
+    if (!InvolvedUnpackRegs.contains(R))
+      continue;
+    if (is64BitOnlySub0Sub1(R, MRI, TRI))
+      Candidates.push_back(R);
+  }
+  llvm::sort(Candidates, [](Register A, Register B) {
+    return A.virtRegIndex() < B.virtRegIndex();
+  });
+
+  const bool Unlimited =
+      MaxRegsToProcess == std::numeric_limits<unsigned>::max();
+
+  bool Changed = false;
+  unsigned Seen = 0;
+  for (Register R : Candidates) {
+    if (!Unlimited && Seen >= MaxRegsToProcess)
+      break;
+    ++Seen;
+    if (split64BitOnlySub0Sub1ToV32(R, MRI, TRI, LIS))
+      Changed = true;
+  }
+  return Changed;
+}
+
+class AMDGPUIGLPUnpackImpl {
+  LiveIntervals *LIS;
+  const SIInstrInfo *TII = nullptr;
+  const SIRegisterInfo *TRI = nullptr;
+
+public:
+  explicit AMDGPUIGLPUnpackImpl(LiveIntervals *L) : LIS(L) {}
+
+  bool run(MachineFunction &MF);
+};
+
+bool AMDGPUIGLPUnpackImpl::run(MachineFunction &MF) {
+  if (!LIS)
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  const MCInstrInfo &II = *TII;
+
+  SmallVector<
+      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 8>
+      SubRegions;
+
+  bool Changed = false;
+  bool AnyVPKUnpacked = false;
+  DenseSet<Register> InvolvedUnpackRegs;
+
+  for (MachineBasicBlock &MBB : MF) {
+    splitMBBBySchedBarriers(MBB, *TII, SubRegions);
+
+    for (auto [Beg, End] : SubRegions) {
+      CandidateRegion CR;
+      if (!findCandidateRegion(MBB, Beg, End, II, CR)) {
+        LLVM_DEBUG({
+          unsigned NumVPK = 0;
+          bool HasValuSpacingIGLP = false;
+          for (auto It = Beg; It != End; ++It) {
+            if (isVPKOpcode(II, It->getOpcode()))
+              ++NumVPK;
+            if (It->getOpcode() == AMDGPU::IGLP_OPT &&
+                It->getNumOperands() >= 1 && It->getOperand(0).isImm() &&
+                It->getOperand(0).getImm() ==
+                    static_cast<int64_t>(
+                        AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID))
+              HasValuSpacingIGLP = true;
+          }
+          if (NumVPK > 0 && kRequireMFMAValuSpacingIGLPOptForRegion &&
+              !HasValuSpacingIGLP)
+            dbgs() << DEBUG_TYPE << ": skip region MBB#" << MBB.getNumber()
+                   << " (" << NumVPK << " V_PK, missing IGLP_OPT imm="
+                   << static_cast<int>(
+                          AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID)
+                   << ")\n";
+        });
+        continue;
+      }
+
+      if (schedRegionHasExplicitAllocatablePhysReg(Beg, End,
+                                                   MF.getRegInfo())) {
+        LLVM_DEBUG({
+          dbgs() << DEBUG_TYPE << ": skip region MBB#" << MBB.getNumber()
+                 << " (explicit allocatable physical register operand)\n";
+        });
+        continue;
+      }
+
+      const unsigned MaxVPK = AMDGPUIGLPUnpackMaxVPKPerRegion;
+      LLVM_DEBUG({
+        dbgs() << DEBUG_TYPE << ": " << MF.getName() << " MBB#"
+               << MBB.getNumber()
+               << " region instrs=" << std::distance(Beg, End)
+               << " v_pk=" << CR.VPKInsts.size();
+        if (MaxVPK > 0)
+          dbgs() << " max_vpk_unpack=" << MaxVPK;
+        dbgs() << "\n";
+      });
+
+      unsigned UnpackSlot = 0;
+      for (MachineInstr *MI : CR.VPKInsts) {
+        if (MaxVPK > 0 && UnpackSlot >= MaxVPK) {
+          LLVM_DEBUG({
+            dbgs() << DEBUG_TYPE << ": skip remaining V_PK in region (limit "
+                   << MaxVPK << ")\n";
+          });
+          break;
+        }
+        ++UnpackSlot;
+        if (performF32Unpacking(*MI, TII, TRI, *LIS, &InvolvedUnpackRegs)) {
+          Changed = true;
+          AnyVPKUnpacked = true;
+        }
+      }
+
+      if (cleanupSameVRegSubregCopiesInRange(CR.Begin, CR.End, MF.getRegInfo(),
+                                             *LIS, *TRI))
+        Changed = true;
+    }
+  }
+
+  if (AnyVPKUnpacked) {
+    const unsigned PostMax = AMDGPUIGLPUnpackPostCleanupMaxVRegs;
+    if (PostMax > 0) {
+      if (postUnpackSplit64BitSubregsToV32(MF, *LIS, *TRI, PostMax,
+                                           InvolvedUnpackRegs))
+        Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+class AMDGPUIGLPUnpackLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUIGLPUnpackLegacy() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+    return AMDGPUIGLPUnpackImpl(&LIS).run(MF);
+  }
+
+  StringRef getPassName() const override { return "AMDGPU IGLP unpack"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Like GCNRewritePartialRegUses: preserving LIS/SlotIndexes avoids the
+    // legacy PM recomputing them before MachineScheduler when we make no MIR
+    // changes.
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addPreserved<LiveIntervalsWrapperPass>();
+    AU.addPreserved<SlotIndexesWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // namespace
+
+char AMDGPUIGLPUnpackLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUIGLPUnpackLegacy, DEBUG_TYPE, "AMDGPU IGLP unpack",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_END(AMDGPUIGLPUnpackLegacy, DEBUG_TYPE, "AMDGPU IGLP unpack",
+                    false, false)
+
+char &llvm::AMDGPUIGLPUnpackID = AMDGPUIGLPUnpackLegacy::ID;
+
+PreservedAnalyses
+AMDGPUIGLPUnpackPass::run(MachineFunction &MF,
+                          MachineFunctionAnalysisManager &MFAM) {
+  LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  AMDGPUIGLPUnpackImpl Impl(&LIS);
+  if (!Impl.run(MF))
+    return PreservedAnalyses::all();
+
+  auto PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index d49ec90e4c212..cf8caebf08813 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -819,13 +819,6 @@ void PipelineSolver::solve() {
   LLVM_DEBUG(DAG->dump());
 }
 
-enum IGLPStrategyID : int {
-  MFMASmallGemmOptID = 0,
-  MFMASmallGemmSingleWaveOptID = 1,
-  MFMAExpInterleaveID = 2,
-  MFMAExpSimpleInterleaveID = 3
-};
-
 // Implement a IGLP scheduling strategy.
 class IGLPStrategy {
 protected:
@@ -896,6 +889,76 @@ bool MFMASmallGemmOpt::applyIGLPStrategy(
   return true;
 }
 
+static bool isMFMAValuSpacingGapValu(const MachineInstr &MI,
+                                     const SIInstrInfo *TII) {
+  if (MI.isMetaInstruction())
+    return false;
+  return TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI) &&
+         !MI.mayLoadOrStore();
+}
+
+/// Interleave MFMA/WMMA with VALU slots: each repeating stage is one MFMA (or
+/// WMMA), then up to N VALU ops per gap where N = floor(#VALU / #MFMA) in this
+/// schedule region (same predicate as \c isMFMAValuSpacingGapValu), at
+/// least 1.
+/// Template length uses MFMACount * 3 for slack, like MFMASmallGemmOpt.
+/// \p IsBottomUp is false so SchedGroup pipeline order matches forward program
+/// order (MFMA before its VALU gap).
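+///
+/// E.g. (illustrative) a region with 2 MFMAs and 5 qualifying VALUs gives
+/// ValuGap = 2, so each stage requests one 1-op MFMA group followed by a
+/// 2-op VALU group.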
+class MFMAValuSpacingOpt final : public IGLPStrategy {
+public:
+  bool applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+      AMDGPU::SchedulingPhase Phase) override;
+
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+                           AMDGPU::SchedulingPhase Phase) override {
+    for (const MachineInstr &I : *DAG)
+      if (TII->isMFMAorWMMA(I))
+        return true;
+    return false;
+  }
+
+  MFMAValuSpacingOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {
+    IsBottomUp = false;
+  }
+};
+
+bool MFMAValuSpacingOpt::applyIGLPStrategy(
+    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+    AMDGPU::SchedulingPhase Phase) {
+  unsigned MFMACount = 0;
+  unsigned ValuCount = 0;
+  for (const MachineInstr &I : *DAG) {
+    if (TII->isMFMAorWMMA(I))
+      ++MFMACount;
+    else if (isMFMAValuSpacingGapValu(I, TII))
+      ++ValuCount;
+  }
+
+  unsigned ValuGap = 1;
+  if (MFMACount > 0) {
+    ValuGap = ValuCount / MFMACount;
+    if (ValuGap < 1)
+      ValuGap = 1;
+  }
+
+  const unsigned PipelineSyncID = 0;
+  SchedGroup *SG = nullptr;
+  for (unsigned I = 0; I < MFMACount * 3; ++I) {
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::VALU, ValuGap, PipelineSyncID, DAG, TII);
+    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  return true;
+}
+
 class MFMAExpInterleaveOpt final : public IGLPStrategy {
 private:
   // The count of TRANS SUs involved in the interleaved pipeline
@@ -2316,17 +2379,19 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 }
 
 static std::unique_ptr<IGLPStrategy>
-createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
+createIGLPStrategy(AMDGPU::IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                    const SIInstrInfo *TII) {
   switch (ID) {
-  case MFMASmallGemmOptID:
+  case AMDGPU::IGLPStrategyID::MFMASmallGemmOptID:
     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
-  case MFMASmallGemmSingleWaveOptID:
+  case AMDGPU::IGLPStrategyID::MFMASmallGemmSingleWaveOptID:
     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
-  case MFMAExpInterleaveID:
+  case AMDGPU::IGLPStrategyID::MFMAExpInterleaveID:
     return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
-  case MFMAExpSimpleInterleaveID:
+  case AMDGPU::IGLPStrategyID::MFMAExpSimpleInterleaveID:
     return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
+  case AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID:
+    return std::make_unique<MFMAValuSpacingOpt>(DAG, TII);
   }
 
   llvm_unreachable("Unknown IGLPStrategyID");
@@ -2702,8 +2767,8 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
 }
 
 bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
-  IGLPStrategyID StrategyID =
-      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
+  auto StrategyID = static_cast<AMDGPU::IGLPStrategyID>(
+      SU.getInstr()->getOperand(0).getImm());
   auto S = createIGLPStrategy(StrategyID, DAG, TII);
   if (!S->shouldApplyStrategy(DAG, Phase))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index 0688f07873493..3b8ecd1fc14e9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
 
+#include "llvm/CodeGen/MachinePassManager.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include <memory>
 
@@ -17,11 +18,31 @@ namespace llvm {
 namespace AMDGPU {
 // The current phase of instruction scheduling
 enum class SchedulingPhase { Initial, PreRAReentry, PostRA };
+
+/// \c llvm.amdgcn.iglp.opt / \c IGLP_OPT immediate; must match
+/// \c createIGLPStrategy in AMDGPUIGroupLP.cpp.
+enum class IGLPStrategyID : int {
+  MFMASmallGemmOptID = 0,
+  MFMASmallGemmSingleWaveOptID = 1,
+  MFMAExpInterleaveID = 2,
+  MFMAExpSimpleInterleaveID = 3,
+  MFMAValuSpacingOptID = 4,
+};
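+// For example, `call void @llvm.amdgcn.iglp.opt(i32 4)` in IR selects
+// MFMAValuSpacingOptID (this is what the new iglp-opt4 tests exercise).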
+
 } // namespace AMDGPU
 
 std::unique_ptr<ScheduleDAGMutation>
 createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase);
 
+/// V_PK unpack and related MIR cleanup before the pre-RA scheduler. By
+/// default, a scheduling region is transformed only when it contains both an
+/// \c IGLP_OPT with immediate \c MFMAValuSpacingOptID (4) and V_PK ops.
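+/// The pass can be exercised in isolation with
+/// \c llc -run-pass=amdgpu-iglp-unpack (see iglp-unpack-mfma-vpk-gfx942.mir).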
+class AMDGPUIGLPUnpackPass : public PassInfoMixin<AMDGPUIGLPUnpackPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
 } // namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMFMAIGROUPLP_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 8a046e83548cc..ff464ca657970 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -112,6 +112,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(*
 #ifndef MACHINE_FUNCTION_PASS
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
+MACHINE_FUNCTION_PASS("amdgpu-iglp-unpack", AMDGPUIGLPUnpackPass())
 MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
 MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
 MACHINE_FUNCTION_PASS("amdgpu-lower-vgpr-encoding", 
AMDGPULowerVGPREncodingPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index daa9f933fce59..5ca481deef518 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -556,6 +556,11 @@ static cl::opt<bool> EnablePreRAOptimizations(
     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
     cl::Hidden);
 
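+// Hidden escape hatch: passing -amdgpu-enable-iglp-unpack=false disables the
+// new unpack pass (it defaults to enabled).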
+static cl::opt<bool> EnableAMDGPUIGLPUnpack(
+    "amdgpu-enable-iglp-unpack",
+    cl::desc("Run AMDGPU IGLP unpack pass before pre-RA scheduling"),
+    cl::init(true), cl::Hidden);
+
 static cl::opt<bool> EnablePromoteKernelArguments(
     "amdgpu-enable-promote-kernel-arguments",
     cl::desc("Enable promotion of flat kernel pointer arguments to global"),
@@ -721,6 +726,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeGCNNSAReassignLegacyPass(*PR);
   initializeGCNPreRAOptimizationsLegacyPass(*PR);
   initializeGCNPreRALongBranchRegLegacyPass(*PR);
+  initializeAMDGPUIGLPUnpackLegacyPass(*PR);
   initializeGCNRewritePartialRegUsesLegacyPass(*PR);
   initializeGCNRegPressurePrinterPass(*PR);
   initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
@@ -1724,6 +1730,9 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (EnableRewritePartialRegUses)
     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
 
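+  // Unpack V_PK ops ahead of the pre-RA machine scheduler so iglp_opt(4)
+  // regions see the individual VALUs (see AMDGPUIGroupLP.h).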
+  if (EnableAMDGPUIGLPUnpack)
+    insertPass(&RenameIndependentSubregsID, &AMDGPUIGLPUnpackID);
+
   if (isPassEnabled(EnablePreRAOptimizations))
     insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);
 
@@ -2498,6 +2507,9 @@ Error AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc(
   if (EnableRewritePartialRegUses)
     insertPass<RenameIndependentSubregsPass>(GCNRewritePartialRegUsesPass());
 
+  if (EnableAMDGPUIGLPUnpack)
+    insertPass<RenameIndependentSubregsPass>(AMDGPUIGLPUnpackPass());
+
   if (isPassEnabled(EnablePreRAOptimizations))
     insertPass<MachineSchedulerPass>(GCNPreRAOptimizationsPass());
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 73d2957516539..95ce25efdf7b9 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMemoryUtils.cpp
   AMDGPUCoExecSchedStrategy.cpp
   AMDGPUIGroupLP.cpp
+  AMDGPUIGroupFixup.cpp
   AMDGPULowerVGPREncoding.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ad24bad1fd5d7..caef31da175fb 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -3244,6 +3244,21 @@ static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
   });
 }
 
+static bool hasIGLPOpt(ScheduleDAGInstrs *DAG, AMDGPU::IGLPStrategyID ID) {
+  return any_of(*DAG, [ID](MachineBasicBlock::iterator MI) {
+    return MI->getOpcode() == AMDGPU::IGLP_OPT && MI->getNumOperands() >= 1 &&
+           MI->getOperand(0).isImm() &&
+           MI->getOperand(0).getImm() == static_cast<int64_t>(ID);
+  });
+}
+
+static bool hasSchedBarrier(ScheduleDAGInstrs *DAG) {
+  return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
+    unsigned Opc = MI->getOpcode();
+    return Opc == AMDGPU::SCHED_BARRIER || Opc == AMDGPU::SCHED_GROUP_BARRIER;
+  });
+}
+
 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
     bool RemoveKillFlags)
@@ -3252,6 +3267,17 @@ GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
 void GCNPostScheduleDAGMILive::schedule() {
   HasIGLPInstrs = hasIGLPInstrs(this);
   if (HasIGLPInstrs) {
+    // MFMAValuSpacingOpt is a pre-RA strategy; its interleaving is already in
+    // place after the initial machine scheduler. Post-RA scheduling would
+    // undo that ordering, so skip it here to preserve the pre-RA schedule.
+    // When SCHED_[GROUP_]BARRIER coexists with IGLP_OPT, IGroupLP ignores the
+    // IGLP_OPT (they are mutually exclusive), so let post-RA scheduling
+    // proceed normally in that case.
+    if (hasIGLPOpt(this, AMDGPU::IGLPStrategyID::MFMAValuSpacingOptID) &&
+        !hasSchedBarrier(this)) {
+      HasIGLPInstrs = false;
+      return;
+    }
     SavedMutations.clear();
     SavedMutations.swap(Mutations);
     addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
diff --git a/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll
new file mode 100644
index 0000000000000..fad3dde3d06bf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Full codegen on gfx950.  Two MFMAs fed by loaded floats; three independent
+; i32 muls stored to a second buffer.  sched.barrier(0) isolates the MUL+MFMA
+; region so that address-computation VALUs don't inflate the VALU gap in
+; MFMAValuSpacingOpt.
+;
+; With iglp_opt(4) the expected MFMA/VALU interleaving (ValuGap=1) is:
+;   MFMA, MUL, MFMA, MUL, MUL
+;
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - %s | FileCheck %s
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @llvm.amdgcn.iglp.opt(i32 immarg)
+declare void @llvm.amdgcn.sched.barrier(i32 immarg)
+
+define amdgpu_kernel void @mfma_valu_iglp4(ptr addrspace(1) %p, ptr addrspace(1) %q) #0 {
+; CHECK-LABEL: mfma_valu_iglp4:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v8, 2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v9, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx2 v[6:7], v8, s[0:1]
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; CHECK-NEXT:    ; sched_barrier mask(0x00000000)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; CHECK-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3]
+; CHECK-NEXT:    v_mul_lo_u32 v4, v6, v6
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3]
+; CHECK-NEXT:    v_mul_lo_u32 v5, v6, v7
+; CHECK-NEXT:    v_mul_lo_u32 v6, v7, v7
+; CHECK-NEXT:    ; iglp_opt mask(0x00000004)
+; CHECK-NEXT:    ; sched_barrier mask(0x00000000)
+; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    global_store_dwordx4 v9, v[0:3], s[0:1]
+; CHECK-NEXT:    global_store_dwordx3 v8, v[4:6], s[2:3]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %t = load <4 x float>, ptr addrspace(1) %p
+  %gep0 = getelementptr inbounds float, ptr addrspace(1) %p, i32 %tid
+  %gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i32 1
+  %f0 = load float, ptr addrspace(1) %gep0
+  %f1 = load float, ptr addrspace(1) %gep1
+  %i0 = bitcast float %f0 to i32
+  %i1 = bitcast float %f1 to i32
+  call void @llvm.amdgcn.sched.barrier(i32 0)
+  %m0 = mul nsw i32 %i0, %i0
+  %m1 = mul nsw i32 %i0, %i1
+  %m2 = mul nsw i32 %i1, %i1
+  call void @llvm.amdgcn.iglp.opt(i32 4)
+  %mai = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %t, i32 0, i32 0, i32 0)
+  %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %mai, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.sched.barrier(i32 0)
+  store <4 x float> %mai2, ptr addrspace(1) %p
+  %qgep0 = getelementptr inbounds i32, ptr addrspace(1) %q, i32 %tid
+  %qgep1 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 1
+  %qgep2 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 2
+  store i32 %m0, ptr addrspace(1) %qgep0
+  store i32 %m1, ptr addrspace(1) %qgep1
+  store i32 %m2, ptr addrspace(1) %qgep2
+  ret void
+}
+
+attributes #0 = { "uniform-work-group-size"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir
new file mode 100644
index 0000000000000..7f173ddc8e5aa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp-opt4-mfma-valu-spacing-scheduler.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# Pre-RA machine scheduler with IGroupLP / MFMAValuSpacingOpt (iglp_opt(4)).
+# With IGLP_OPT 4 the expected MFMA/VALU interleaving (ValuGap=1) is:
+#   MFMA, MUL, MFMA, MUL, MUL
+#
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+name:            mfma_valu_iglp4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: mfma_valu_iglp4
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF]], [[DEF1]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF]], [[DEF]], implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 [[DEF]], [[DEF1]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DEF1]], [[DEF1]], implicit $exec
+    ; CHECK-NEXT: IGLP_OPT 4
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_]], implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_1]]
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:areg_128_align2 = IMPLICIT_DEF
+    %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %0, %0, implicit $exec
+    %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %0, %1, implicit $exec
+    %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %1, %1, implicit $exec
+    %6:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %0, %1, %2, 0, 0, 0, implicit $mode, implicit $exec
+    %7:areg_128_align2 = V_MFMA_F32_4X4X1F32_e64 %0, %1, %6, 0, 0, 0, implicit $mode, implicit $exec
+    IGLP_OPT 4
+    S_ENDPGM 0, implicit %3, implicit %4, implicit %5, implicit %7
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir b/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir
new file mode 100644
index 0000000000000..76b837dae3ea1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk-gfx942.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# Reduced from a large FMHA-style region (MFMA tower + VALU + V_PK_FMA_F32 +
+# IGLP_OPT 4 between SCHED_BARRIERs). Pair with `iglp-unpack-mfma-vpk.ll` (IR).
+#
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-iglp-unpack -o - %s | FileCheck %s
+
+---
+name:            iglp_unpack_mfma_vpk
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: iglp_unpack_mfma_vpk
+    ; CHECK: SCHED_BARRIER 0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: early-clobber %2:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 [[DEF]].sub0_sub1, [[DEF1]].sub0_sub1, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead [[V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_:%[0-9]+]]:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 [[DEF]].sub2_sub3, [[DEF1]].sub2_sub3, [[V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[DEF5]].sub0, 0, [[DEF3]].sub0, 0, [[DEF4]].sub0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[DEF5]].sub1, 0, [[DEF3]].sub0, 0, [[DEF4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: IGLP_OPT 4
+    ; CHECK-NEXT: SCHED_BARRIER 0
+    ; CHECK-NEXT: S_ENDPGM 0
+    SCHED_BARRIER 0
+    %0:vreg_128_align2 = IMPLICIT_DEF
+    %1:vreg_128_align2 = IMPLICIT_DEF
+    early-clobber %2:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %0.sub0_sub1:vreg_128_align2, %1.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %2:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %0.sub2_sub3:vreg_128_align2, %1.sub2_sub3:vreg_128_align2, %2:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %3:sgpr_256 = IMPLICIT_DEF
+    %8:sgpr_64 = IMPLICIT_DEF
+    %10:vreg_64_align2 = IMPLICIT_DEF
+    %28:vreg_64_align2 = IMPLICIT_DEF
+    %31:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, %28:vreg_64_align2, 0, %8:sgpr_64, 8, %10:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    IGLP_OPT 4
+    SCHED_BARRIER 0
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll b/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll
new file mode 100644
index 0000000000000..dc26df7f55783
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp-unpack-mfma-vpk.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Verify that iglp_opt(4) unpacks V_PK_FMA_F32 into two scalar V_FMA_F32 ops.
+;
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -O2 -o - %s | FileCheck %s
+
+define amdgpu_kernel void @iglp_unpack_mfma_vpk_ir(ptr addrspace(1) %p, i64 %a, i64 %b) #0 {
+; CHECK-LABEL: iglp_unpack_mfma_vpk_ir:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[20:21], s[4:5], 0x10
+; CHECK-NEXT:    ; kill: killed $sgpr4_sgpr5
+; CHECK-NEXT:    v_mov_b32_e32 v16, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v18, s18
+; CHECK-NEXT:    v_mov_b32_e32 v19, s19
+; CHECK-NEXT:    ; sched_barrier mask(0x00000000)
+; CHECK-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; CHECK-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; CHECK-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; CHECK-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; CHECK-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; CHECK-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; CHECK-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[18:19], v[20:21], v[0:15]
+; CHECK-NEXT:    s_nop 10
+; CHECK-NEXT:    v_fma_f32 v0, v0, s18, v0
+; CHECK-NEXT:    v_fma_f32 v1, v1, s19, v1
+; CHECK-NEXT:    ; iglp_opt mask(0x00000004)
+; CHECK-NEXT:    ; sched_barrier mask(0x00000000)
+; CHECK-NEXT:    global_store_dwordx2 v16, v[0:1], s[16:17]
+; CHECK-NEXT:    s_endpgm
+entry:
+  call void @llvm.amdgcn.sched.barrier(i32 0)
+  %acc = load <16 x float>, ptr addrspace(1) %p
+  %m = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %acc, i32 0, i32 0, i32 0)
+  %e0 = extractelement <16 x float> %m, i64 0
+  %e1 = extractelement <16 x float> %m, i64 1
+  %v = insertelement <2 x float> poison, float %e0, i64 0
+  %v2 = insertelement <2 x float> %v, float %e1, i64 1
+  %c = load <2 x float>, ptr addrspace(1) %p
+  %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %c, <2 x float> %v2)
+  call void @llvm.amdgcn.iglp.opt(i32 4)
+  call void @llvm.amdgcn.sched.barrier(i32 0)
+  store <2 x float> %fma, ptr addrspace(1) %p
+  ret void
+}
+
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64, i64, <16 x float>, i32, i32, i32)
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+declare void @llvm.amdgcn.sched.barrier(i32 immarg)
+declare void @llvm.amdgcn.iglp.opt(i32 immarg)
+
+attributes #0 = { "uniform-work-group-size"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index c49b2b927bd31..5f88d9eed5eac 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -201,6 +201,7 @@
 ; GCN-O2-NEXT: register-coalescer
 ; GCN-O2-NEXT: rename-independent-subregs
 ; GCN-O2-NEXT: amdgpu-rewrite-partial-reg-uses
+; GCN-O2-NEXT: amdgpu-iglp-unpack
 ; GCN-O2-NEXT: machine-scheduler
 ; GCN-O2-NEXT: amdgpu-pre-ra-optimizations
 ; GCN-O2-NEXT: si-wqm
@@ -370,6 +371,7 @@
 ; GCN-O3-NEXT: register-coalescer
 ; GCN-O3-NEXT: rename-independent-subregs
 ; GCN-O3-NEXT: amdgpu-rewrite-partial-reg-uses
+; GCN-O3-NEXT: amdgpu-iglp-unpack
 ; GCN-O3-NEXT: machine-scheduler
 ; GCN-O3-NEXT: amdgpu-pre-ra-optimizations
 ; GCN-O3-NEXT: si-wqm
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index cf127b0bc0d3b..8ea844e401c14 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -360,6 +360,7 @@
 ; GCN-O1-NEXT:        Register Coalescer
 ; GCN-O1-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-NEXT:        Rewrite Partial Register Uses
+; GCN-O1-NEXT:        AMDGPU IGLP unpack
 ; GCN-O1-NEXT:        Machine Instruction Scheduler
 ; GCN-O1-NEXT:        SI Whole Quad Mode
 ; GCN-O1-NEXT:        SI optimize exec mask operations pre-RA
@@ -676,6 +677,7 @@
 ; GCN-O1-OPTS-NEXT:        Register Coalescer
 ; GCN-O1-OPTS-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-OPTS-NEXT:        Rewrite Partial Register Uses
+; GCN-O1-OPTS-NEXT:        AMDGPU IGLP unpack
 ; GCN-O1-OPTS-NEXT:        Machine Instruction Scheduler
 ; GCN-O1-OPTS-NEXT:        AMDGPU Pre-RA optimizations
 ; GCN-O1-OPTS-NEXT:        SI Whole Quad Mode
@@ -997,6 +999,7 @@
 ; GCN-O2-NEXT:        Register Coalescer
 ; GCN-O2-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O2-NEXT:        Rewrite Partial Register Uses
+; GCN-O2-NEXT:        AMDGPU IGLP unpack
 ; GCN-O2-NEXT:        Machine Instruction Scheduler
 ; GCN-O2-NEXT:        AMDGPU Pre-RA optimizations
 ; GCN-O2-NEXT:        SI Whole Quad Mode
@@ -1332,6 +1335,7 @@
 ; GCN-O3-NEXT:        Register Coalescer
 ; GCN-O3-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O3-NEXT:        Rewrite Partial Register Uses
+; GCN-O3-NEXT:        AMDGPU IGLP unpack
 ; GCN-O3-NEXT:        Machine Instruction Scheduler
 ; GCN-O3-NEXT:        AMDGPU Pre-RA optimizations
 ; GCN-O3-NEXT:        SI Whole Quad Mode
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 75a97a8b840e5..d82bb3eacf1b8 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -152,6 +152,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUHSAMetadataStreamer.cpp",
     "AMDGPUHazardLatency.cpp",
     "AMDGPUIGroupLP.cpp",
+    "AMDGPUIGLPUnpack.cpp",
     "AMDGPUISelDAGToDAG.cpp",
     "AMDGPUISelLowering.cpp",
     "AMDGPUImageIntrinsicOptimizer.cpp",
