https://github.com/phoebewang updated 
https://github.com/llvm/llvm-project/pull/191368

>From 3a58e769c59f87fa00b123cb80e778a047f99f8a Mon Sep 17 00:00:00 2001
From: Phoebe Wang <[email protected]>
Date: Fri, 10 Apr 2026 17:10:06 +0800
Subject: [PATCH 1/8] [X86][APX] Return CopyMI when added in
 foldMemoryOperandImpl

Fixes: #190962 #191165
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h  | 11 +++---
 llvm/lib/CodeGen/InlineSpiller.cpp           | 17 +++++++--
 llvm/lib/CodeGen/LiveRangeEdit.cpp           | 14 ++++++--
 llvm/lib/CodeGen/PeepholeOptimizer.cpp       |  7 ++--
 llvm/lib/CodeGen/TargetInstrInfo.cpp         | 12 ++++---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp |  2 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.h   | 12 +++----
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp       |  4 +--
 llvm/lib/Target/AMDGPU/SIInstrInfo.h         |  2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp     |  6 ++--
 llvm/lib/Target/RISCV/RISCVInstrInfo.h       |  4 +--
 llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp |  4 +--
 llvm/lib/Target/SystemZ/SystemZInstrInfo.h   | 14 ++++----
 llvm/lib/Target/X86/X86FastISel.cpp          |  3 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp         | 35 ++++++++++--------
 llvm/lib/Target/X86/X86InstrInfo.h           | 17 ++++-----
 llvm/test/CodeGen/X86/apx/or.ll              | 38 ++++++++++++++++++++
 17 files changed, 138 insertions(+), 64 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h 
b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index cd5561e57d033..e69f377fd0bd4 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1257,14 +1257,14 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   /// If VRM is passed, the assigned physregs can be inspected by target to
   /// decide on using an opcode (note that those assignments can still change).
   MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
-                                  int FI,
+                                  int FI, MachineInstr *&CopyMI,
                                   LiveIntervals *LIS = nullptr,
                                   VirtRegMap *VRM = nullptr) const;
 
   /// Same as the previous version except it allows folding of any load and
   /// store from / to any address, not just from a specific stack slot.
   MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
-                                  MachineInstr &LoadMI,
+                                  MachineInstr &LoadMI, MachineInstr *&CopyMI,
                                   LiveIntervals *LIS = nullptr) const;
 
   /// This function defines the logic to lower COPY instruction to
@@ -1446,7 +1446,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr,
+                        MachineInstr *&CopyMI, LiveIntervals *LIS = nullptr,
                         VirtRegMap *VRM = nullptr) const {
     return nullptr;
   }
@@ -1459,7 +1459,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   virtual MachineInstr *foldMemoryOperandImpl(
       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
-      LiveIntervals *LIS = nullptr) const {
+      MachineInstr *&CopyMI, LiveIntervals *LIS = nullptr) const {
     return nullptr;
   }
 
@@ -1845,7 +1845,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
                                           const MachineRegisterInfo *MRI,
                                           Register &FoldAsLoadDefReg,
-                                          MachineInstr *&DefMI) const;
+                                          MachineInstr *&DefMI,
+                                          MachineInstr *&CopyMI) const;
 
   /// 'Reg' is known to be defined by a move immediate instruction,
   /// try to fold the immediate into the use instruction.
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp 
b/llvm/lib/CodeGen/InlineSpiller.cpp
index 668c7c0a78098..fab04c44371e9 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -1016,9 +1016,11 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, 
unsigned>> Ops,
       MI->untieRegOperand(Idx);
     }
 
+  MachineInstr *CopyMI = nullptr;
   MachineInstr *FoldMI =
-      LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, &LIS)
-             : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS, &VRM);
+      LoadMI
+          ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, CopyMI, &LIS)
+          : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, CopyMI, &LIS, &VRM);
   if (!FoldMI) {
     // Re-tie operands.
     for (auto Tied : TiedOps)
@@ -1050,7 +1052,14 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
   if (TII.isStoreToStackSlot(*MI, FI) &&
       HSpiller.rmFromMergeableSpills(*MI, FI))
     --NumSpills;
-  LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
+  SlotIndex FoldIdx = LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
+  if (CopyMI) {
+    SlotIndex CopyIdx = LIS.InsertMachineInstrInMaps(*CopyMI).getRegSlot();
+    if (!MRI.isSSA()) {
+      LiveInterval &LI = LIS.getInterval(CopyMI->getOperand(0).getReg());
+      VNInfo *VNI = LI.getNextValue(CopyIdx, LIS.getVNInfoAllocator());
+      LI.addSegment(LiveRange::Segment(CopyIdx, FoldIdx.getRegSlot(), VNI));
+    }
+  }
   // Update the call info.
   if (MI->isCandidateForAdditionalCallInfo())
     MI->getMF()->moveAdditionalCallInfo(MI, FoldMI);
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp 
b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 4e1b0c0c66e69..638adf18997ba 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -151,11 +151,21 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
   if (UseMI->readsWritesVirtualRegister(LI->reg(), &Ops).second)
     return false;
 
-  MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS);
+  MachineInstr *CopyMI = nullptr;
+  MachineInstr *FoldMI =
+      TII.foldMemoryOperand(*UseMI, Ops, *DefMI, CopyMI, &LIS);
   if (!FoldMI)
     return false;
   LLVM_DEBUG(dbgs() << "                folded: " << *FoldMI);
-  LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
+  SlotIndex FoldIdx = LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
+  if (CopyMI) {
+    SlotIndex CopyIdx = LIS.InsertMachineInstrInMaps(*CopyMI).getRegSlot();
+    if (!MRI.isSSA()) {
+      LiveInterval &LI = LIS.getInterval(CopyMI->getOperand(0).getReg());
+      VNInfo *VNI = LI.getNextValue(CopyIdx, LIS.getVNInfoAllocator());
+      LI.addSegment(LiveRange::Segment(CopyIdx, FoldIdx.getRegSlot(), VNI));
+    }
+  }
   // Update the call info.
   if (UseMI->shouldUpdateAdditionalCallInfo())
     UseMI->getMF()->moveAdditionalCallInfo(UseMI, FoldMI);
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp 
b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 73aecda4e522c..9365ea883eec9 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1865,8 +1865,9 @@ bool PeepholeOptimizer::run(MachineFunction &MF) {
             // we need it for markUsesInDebugValueAsUndef().
             Register FoldedReg = FoldAsLoadDefReg;
             MachineInstr *DefMI = nullptr;
-            if (MachineInstr *FoldMI =
-                    TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) 
{
+            MachineInstr *CopyMI = nullptr;
+            if (MachineInstr *FoldMI = TII->optimizeLoadInstr(
+                    *MI, MRI, FoldAsLoadDefReg, DefMI, CopyMI)) {
               // Update LocalMIs since we replaced MI with FoldMI and deleted
               // DefMI.
               LLVM_DEBUG(dbgs() << "Replacing: " << *MI);
@@ -1874,6 +1875,8 @@ bool PeepholeOptimizer::run(MachineFunction &MF) {
               LocalMIs.erase(MI);
               LocalMIs.erase(DefMI);
               LocalMIs.insert(FoldMI);
+              if (CopyMI)
+                LocalMIs.insert(CopyMI);
               // Update the call info.
               if (MI->shouldUpdateAdditionalCallInfo())
                 MI->getMF()->moveAdditionalCallInfo(MI, FoldMI);
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp 
b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 3e3b935135350..ba836df02048c 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -518,7 +518,8 @@ MCInst TargetInstrInfo::getNop() const { 
llvm_unreachable("Not implemented"); }
 MachineInstr *TargetInstrInfo::optimizeLoadInstr(MachineInstr &MI,
                                                  const MachineRegisterInfo 
*MRI,
                                                  Register &FoldAsLoadDefReg,
-                                                 MachineInstr *&DefMI) const {
+                                                 MachineInstr *&DefMI,
+                                                 MachineInstr *&CopyMI) const {
   // Check whether we can move DefMI here.
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
@@ -544,7 +545,8 @@ MachineInstr 
*TargetInstrInfo::optimizeLoadInstr(MachineInstr &MI,
     return nullptr;
 
   // Check whether we can fold the def into SrcOperandId.
-  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
+  if (MachineInstr *FoldMI =
+          foldMemoryOperand(MI, SrcOperandIds, *DefMI, CopyMI)) {
     FoldAsLoadDefReg = 0;
     return FoldMI;
   }
@@ -703,6 +705,7 @@ static MachineInstr *foldInlineAsmMemOperand(MachineInstr 
&MI,
 
 MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
                                                  ArrayRef<unsigned> Ops, int 
FI,
+                                                 MachineInstr *&CopyMI,
                                                  LiveIntervals *LIS,
                                                  VirtRegMap *VRM) const {
   auto Flags = MachineMemOperand::MONone;
@@ -751,7 +754,7 @@ MachineInstr 
*TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
     return foldInlineAsmMemOperand(MI, Ops, FI, *this);
   } else {
     // Ask the target to do the actual folding.
-    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS, VRM);
+    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, CopyMI, LIS, VRM);
   }
 
   if (NewMI) {
@@ -804,6 +807,7 @@ MachineInstr 
*TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
 MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
                                                  ArrayRef<unsigned> Ops,
                                                  MachineInstr &LoadMI,
+                                                 MachineInstr *&CopyMI,
                                                  LiveIntervals *LIS) const {
   assert(LoadMI.canFoldAsLoad() && "LoadMI isn't foldable!");
 #ifndef NDEBUG
@@ -830,7 +834,7 @@ MachineInstr 
*TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
     return foldInlineAsmMemOperand(MI, Ops, FrameIndex, *this);
   } else {
     // Ask the target to do the actual folding.
-    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, LIS);
+    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, CopyMI, LIS);
   }
 
   if (!NewMI)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4094526574d7a..988fe7014ddcf 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6807,7 +6807,7 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
 
 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
-    MachineBasicBlock::iterator InsertPt, int FrameIndex,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr 
*&CopyMI,
     LiveIntervals *LIS, VirtRegMap *VRM) const {
   // This is a bit of a hack. Consider this instruction:
   //
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index a1ec4cbffdf02..fa63c56739d62 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -382,12 +382,12 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo 
{
   bool isSubregFoldable() const override { return true; }
 
   using TargetInstrInfo::foldMemoryOperandImpl;
-  MachineInstr *
-  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
-                        ArrayRef<unsigned> Ops,
-                        MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr,
-                        VirtRegMap *VRM = nullptr) const override;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                      ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
+                                      int FrameIndex, MachineInstr *&CopyMI,
+                                      LiveIntervals *LIS = nullptr,
+                                      VirtRegMap *VRM = nullptr) const 
override;
 
   /// \returns true if a branch from an instruction with opcode \p BranchOpc
   ///  bytes is capable of jumping to a position \p BrOffset bytes away.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3179c65340d18..bc3052b139d18 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10656,8 +10656,8 @@ bool llvm::SIInstrInfo::isWave32() const { return 
ST.isWave32(); }
 
 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
-    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
-    VirtRegMap *VRM) const {
+    MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr 
*&CopyMI,
+    LiveIntervals *LIS, VirtRegMap *VRM) const {
   // This is a bit of a hack (copied from AArch64). Consider this instruction:
   //
   //   %0:sreg_32 = COPY $m0
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a58e281de17a1..3c1232ac098a0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1682,7 +1682,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                                       ArrayRef<unsigned> Ops,
                                       MachineBasicBlock::iterator InsertPt,
-                                      int FrameIndex,
+                                      int FrameIndex, MachineInstr *&CopyMI,
                                       LiveIntervals *LIS = nullptr,
                                       VirtRegMap *VRM = nullptr) const 
override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp 
b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index db559f4949904..ee10d71727c39 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -905,8 +905,8 @@ std::optional<unsigned> getFoldedOpcode(MachineFunction 
&MF, MachineInstr &MI,
 // This is the version used during InlineSpiller::spillAroundUses
 MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
-    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
-    VirtRegMap *VRM) const {
+    MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr 
*&CopyMI,
+    LiveIntervals *LIS, VirtRegMap *VRM) const {
 
   std::optional<unsigned> LoadOpc = getFoldedOpcode(MF, MI, Ops, STI);
   if (!LoadOpc)
@@ -952,7 +952,7 @@ static unsigned getLoadPredicatedOpcode(unsigned Opcode) {
 MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
-    LiveIntervals *LIS) const {
+    MachineInstr *&CopyMI, LiveIntervals *LIS) const {
   // For now, only handle RISCV::PseudoCCMOVGPR.
   if (MI.getOpcode() != RISCV::PseudoCCMOVGPR)
     return nullptr;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h 
b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 5c36ff7525200..119b2a7eae6bf 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -130,14 +130,14 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                                       ArrayRef<unsigned> Ops,
                                       MachineBasicBlock::iterator InsertPt,
-                                      int FrameIndex,
+                                      int FrameIndex, MachineInstr *&CopyMI,
                                       LiveIntervals *LIS = nullptr,
                                       VirtRegMap *VRM = nullptr) const 
override;
 
   MachineInstr *foldMemoryOperandImpl(
       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
-      LiveIntervals *LIS = nullptr) const override;
+      MachineInstr *&CopyMI, LiveIntervals *LIS = nullptr) const override;
 
   // Materializes the given integer Val into DstReg.
   void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp 
b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index a76424eff1e49..5d85a64844592 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -1225,7 +1225,7 @@ SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const 
{
 
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
-    MachineBasicBlock::iterator InsertPt, int FrameIndex,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr 
*&CopyMI,
     LiveIntervals *LIS, VirtRegMap *VRM) const {
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1558,7 +1558,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
-    LiveIntervals *LIS) const {
+    MachineInstr *&CopyMI, LiveIntervals *LIS) const {
   MachineRegisterInfo *MRI = &MF.getRegInfo();
   MachineBasicBlock *MBB = MI.getParent();
 
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h 
b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 029fe93d5b15c..9fbd8e9a28d1d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -292,16 +292,16 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
                                    bool Invert) const override;
   std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
 
-  MachineInstr *
-  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
-                        ArrayRef<unsigned> Ops,
-                        MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr,
-                        VirtRegMap *VRM = nullptr) const override;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                      ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
+                                      int FrameIndex, MachineInstr *&CopyMI,
+                                      LiveIntervals *LIS = nullptr,
+                                      VirtRegMap *VRM = nullptr) const 
override;
   MachineInstr *foldMemoryOperandImpl(
       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
-      LiveIntervals *LIS = nullptr) const override;
+      MachineInstr *&CopyMI, LiveIntervals *LIS = nullptr) const override;
   bool expandPostRAPseudo(MachineInstr &MBBI) const override;
   bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
     override;
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp 
b/llvm/lib/Target/X86/X86FastISel.cpp
index 3dbe6d14c610e..2017897166f3c 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3992,9 +3992,10 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, 
unsigned OpNo,
   SmallVector<MachineOperand, 8> AddrOps;
   AM.getFullAddress(AddrOps);
 
+  MachineInstr *CopyMI = nullptr;
   MachineInstr *Result = XII.foldMemoryOperandImpl(
       *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, 
LI->getAlign(),
-      /*AllowCommute=*/true);
+      /*AllowCommute=*/true, CopyMI);
   if (!Result)
     return false;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp 
b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5b6858f59e6d6..50e0bfcac9321 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7485,7 +7485,8 @@ static void printFailMsgforFold(const MachineInstr &MI, 
unsigned Idx) {
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
-    unsigned Size, Align Alignment, bool AllowCommute) const {
+    unsigned Size, Align Alignment, bool AllowCommute,
+    MachineInstr *&CopyMI) const {
   bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
   unsigned Opc = MI.getOpcode();
 
@@ -7598,11 +7599,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
         return NewMI;
 
       const TargetRegisterClass &RC = *MF.getRegInfo().getRegClass(SrcReg);
-      Register NewSrc = MF.getRegInfo().createVirtualRegister(&RC);
-      BuildMI(*NewMI->getParent(), *NewMI, MI.getDebugLoc(),
-              get(TargetOpcode::COPY))
-          .addReg(NewSrc, RegState::Define)
-          .addReg(SrcReg);
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      Register NewSrc = MRI.isSSA() ? MRI.createVirtualRegister(&RC)
+                                    : MI.getOperand(0).getReg();
+      CopyMI = BuildMI(*NewMI->getParent(), *NewMI, MI.getDebugLoc(),
+                       get(TargetOpcode::COPY))
+                   .addReg(NewSrc, RegState::Define)
+                   .addReg(SrcReg);
       NewMI->getOperand(1).setReg(NewSrc);
     }
     return NewMI;
@@ -7618,7 +7621,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     }
     // Attempt to fold with the commuted version of the instruction.
     NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
-                                  Alignment, /*AllowCommute=*/false);
+                                  Alignment, /*AllowCommute=*/false, CopyMI);
     if (NewMI)
       return NewMI;
     // Folding failed again - undo the commute before returning.
@@ -7631,8 +7634,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
 
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
-    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
-    VirtRegMap *VRM) const {
+    MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr 
*&CopyMI,
+    LiveIntervals *LIS, VirtRegMap *VRM) const {
   // Check switch flag
   if (NoFusing)
     return nullptr;
@@ -7665,9 +7668,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
         std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
 
   auto Impl = [&]() {
-    return foldMemoryOperandImpl(MF, MI, Ops[0],
-                                 MachineOperand::CreateFI(FrameIndex), 
InsertPt,
-                                 Size, Alignment, /*AllowCommute=*/true);
+    return foldMemoryOperandImpl(
+        MF, MI, Ops[0], MachineOperand::CreateFI(FrameIndex), InsertPt, Size,
+        Alignment, /*AllowCommute=*/true, CopyMI);
   };
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
@@ -8147,7 +8150,7 @@ static bool isNonFoldablePartialRegisterLoad(const 
MachineInstr &LoadMI,
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
-    LiveIntervals *LIS) const {
+    MachineInstr *&CopyMI, LiveIntervals *LIS) const {
 
   // If LoadMI is a masked load, check MI having the same mask.
   const MCInstrDesc &MCID = get(LoadMI.getOpcode());
@@ -8199,7 +8202,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
     if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
       return nullptr;
-    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
+    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, CopyMI,
+                                 LIS);
   }
 
   // Check switch flag
@@ -8444,7 +8448,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   }
   }
   return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
-                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
+                               /*Size=*/0, Alignment, /*AllowCommute=*/true,
+                               CopyMI);
 }
 
 MachineInstr *
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h 
b/llvm/lib/Target/X86/X86InstrInfo.h
index 9695d9a79ec96..e9c5d7ccb32bf 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -492,19 +492,19 @@ class X86InstrInfo final : public X86GenInstrInfo {
   /// is likely that the referenced instruction has been changed.
   ///
   /// \returns true on success.
-  MachineInstr *
-  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
-                        ArrayRef<unsigned> Ops,
-                        MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr,
-                        VirtRegMap *VRM = nullptr) const override;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                      ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
+                                      int FrameIndex, MachineInstr *&CopyMI,
+                                      LiveIntervals *LIS = nullptr,
+                                      VirtRegMap *VRM = nullptr) const 
override;
 
   /// Same as the previous version except it allows folding of any load and
   /// store from / to any address, not just from a specific stack slot.
   MachineInstr *foldMemoryOperandImpl(
       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
-      LiveIntervals *LIS = nullptr) const override;
+      MachineInstr *&CopyMI, LiveIntervals *LIS = nullptr) const override;
 
   bool
   unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, Register Reg,
@@ -582,7 +582,8 @@ class X86InstrInfo final : public X86GenInstrInfo {
                                       ArrayRef<MachineOperand> MOs,
                                       MachineBasicBlock::iterator InsertPt,
                                       unsigned Size, Align Alignment,
-                                      bool AllowCommute) const;
+                                      bool AllowCommute,
+                                      MachineInstr *&CopyMI) const;
 
   bool isHighLatencyDef(int opc) const override;
 
diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index 594ed7b29216b..3bb2c4041308f 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -1240,3 +1240,41 @@ entry:
   store i64 %or, ptr %a
   ret void
 }
+
+define i64 @pr191165(i32 %0, ptr %1) {
+; NDD-LABEL: pr191165:
+; NDD:       # %bb.0:
+; NDD-NEXT:    movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT:    orl %edi, %eax # encoding: [0x09,0xf8]
+; NDD-NEXT:    movl %eax, 0 # encoding: [0x89,0x04,0x25,0x00,0x00,0x00,0x00]
+; NDD-NEXT:    movslq %edi, %rax # encoding: [0x48,0x63,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: pr191165:
+; IMMONLY:       # %bb.0:
+; IMMONLY-NEXT:    movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT:    orl %edi, %eax # encoding: [0x09,0xf8]
+; IMMONLY-NEXT:    movl %eax, 0 # encoding: 
[0x89,0x04,0x25,0x00,0x00,0x00,0x00]
+; IMMONLY-NEXT:    movslq %edi, %rax # encoding: [0x48,0x63,0xc7]
+; IMMONLY-NEXT:    retq # encoding: [0xc3]
+;
+; MEM-LABEL: pr191165:
+; MEM:       # %bb.0:
+; MEM-NEXT:    orl (%rsi), %edi, %eax # encoding: 
[0x62,0xf4,0x7c,0x18,0x0b,0x3e]
+; MEM-NEXT:    movl %eax, 0 # encoding: [0x89,0x04,0x25,0x00,0x00,0x00,0x00]
+; MEM-NEXT:    movslq %edi, %rax # encoding: [0x48,0x63,0xc7]
+; MEM-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: pr191165:
+; NF:       # %bb.0:
+; NF-NEXT:    movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NF-NEXT:    orl %edi, %eax # encoding: [0x09,0xf8]
+; NF-NEXT:    movl %eax, 0 # encoding: [0x89,0x04,0x25,0x00,0x00,0x00,0x00]
+; NF-NEXT:    movslq %edi, %rax # encoding: [0x48,0x63,0xc7]
+; NF-NEXT:    retq # encoding: [0xc3]
+  %3 = load i32, ptr %1, align 4
+  %4 = or i32 %3, %0
+  store volatile i32 %4, ptr null, align 4
+  %5 = sext i32 %0 to i64
+  ret i64 %5
+}

>From 789470e6a9a1b91cf541ebf1950eb84ca935b8f0 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <[email protected]>
Date: Fri, 10 Apr 2026 17:16:17 +0800
Subject: [PATCH 2/8] Add missing test

---
 clang/test/CodeGen/X86/pr190962.ll | 65 ++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 clang/test/CodeGen/X86/pr190962.ll

diff --git a/clang/test/CodeGen/X86/pr190962.ll 
b/clang/test/CodeGen/X86/pr190962.ll
new file mode 100644
index 0000000000000..acd59b2ee3c46
--- /dev/null
+++ b/clang/test/CodeGen/X86/pr190962.ll
@@ -0,0 +1,65 @@
+; REQUIRES: x86-registered-target
+; RUN: %clang -O1 -mapx-features=ndd --target=x86_64-pc-windows-gnu -S %s 
-mllvm -verify-machineinstrs -o /dev/null
+
+;; Check no crash when building below IR with Clang.
+
+define i32 @foo(ptr %0, ptr %1, ptr %2, i64 %3, i64 %4, i64 %5) {
+  %7 = call i64 
@"_ZZN3jxl15PatchDictionary6DecodeEP22JxlMemoryManagerStructPNS_9BitReaderEyyyPbENK3$_0clEy"()
+  %8 = mul i64 %3, %4
+  %9 = icmp ugt i64 1, %8
+  br i1 %9, label %common.ret1, label %10
+
+common.ret1:                                      ; preds = %26, %23, %16, %6
+  %common.ret1.op = phi i32 [ 0, %23 ], [ 0, %16 ], [ 0, %26 ], [ 0, %6 ]
+  ret i32 %common.ret1.op
+
+10:                                               ; preds = %6
+  %11 = load volatile i64, ptr null, align 8
+  %12 = call i64 
@"_ZZN3jxl15PatchDictionary6DecodeEP22JxlMemoryManagerStructPNS_9BitReaderEyyyPbENK3$_0clEy"()
+  %13 = load volatile i64, ptr null, align 8
+  %14 = or i64 %11, %5
+  %15 = icmp ugt i64 %14, 0
+  br i1 %15, label %16, label %19
+
+16:                                               ; preds = %10
+  call void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
+  %17 = load i32, ptr null, align 4
+  %18 = call i32 (i32, ptr, ...) @_ZN3jxl13StatusMessageENS_6StatusEPKcz(i32 
%17, ptr null, ptr null, i32 0)
+  call void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
+  br label %common.ret1
+
+19:                                               ; preds = %10
+  %20 = call i64 @_ZNK3jxl11ImageBundle5ysizeEv(ptr %1)
+  %21 = or i64 %12, %13
+  %22 = icmp ugt i64 %21, 0
+  br i1 %22, label %23, label %26
+
+23:                                               ; preds = %19
+  call void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
+  %24 = load i32, ptr null, align 4
+  %25 = call i32 (i32, ptr, ...) @_ZN3jxl13StatusMessageENS_6StatusEPKcz(i32 
%24, ptr null, ptr null, i32 1)
+  call void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
+  br label %common.ret1
+
+26:                                               ; preds = %19
+  %27 = icmp ugt i64 1, %3
+  br i1 %27, label %common.ret1, label %28
+
+28:                                               ; preds = %26
+  store i32 0, ptr %0, align 4
+  %29 = call i32 (i32, ptr, ...) @_ZN3jxl13StatusMessageENS_6StatusEPKcz(i32 
0, ptr null, ptr null, i32 0, i64 0, i64 0, i64 %4)
+  unreachable
+}
+
+declare i32 @_ZN3jxl13StatusMessageENS_6StatusEPKcz(i32, ptr, ...)
+
+declare i64 
@"_ZZN3jxl15PatchDictionary6DecodeEP22JxlMemoryManagerStructPNS_9BitReaderEyyyPbENK3$_0clEy"()
+
+declare void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
+
+declare i64 @_ZNK3jxl11ImageBundle5ysizeEv(ptr)
+
+; uselistorder directives
+uselistorder ptr @_ZN3jxl13StatusMessageENS_6StatusEPKcz, { 2, 1, 0 }
+uselistorder ptr 
@"_ZZN3jxl15PatchDictionary6DecodeEP22JxlMemoryManagerStructPNS_9BitReaderEyyyPbENK3$_0clEy",
 { 1, 0 }
+uselistorder ptr @_ZN3jxl6StatusC2ENS_10StatusCodeE, { 3, 2, 1, 0 }

>From d701f200442de49d1661edbd14345b4d1500747f Mon Sep 17 00:00:00 2001
From: Phoebe Wang <[email protected]>
Date: Fri, 10 Apr 2026 21:02:55 +0800
Subject: [PATCH 3/8] Address comments and new failure

---
 clang/test/CodeGen/X86/pr190962.ll    |  65 ----------------
 llvm/lib/CodeGen/InlineSpiller.cpp    |   5 +-
 llvm/lib/CodeGen/LiveRangeEdit.cpp    |   8 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp  |  15 +++-
 llvm/test/CodeGen/X86/apx/or.ll       |  12 +--
 llvm/test/CodeGen/X86/apx/pr191368.ll | 104 ++++++++++++++++++++++++++
 6 files changed, 127 insertions(+), 82 deletions(-)
 delete mode 100644 clang/test/CodeGen/X86/pr190962.ll
 create mode 100644 llvm/test/CodeGen/X86/apx/pr191368.ll

diff --git a/clang/test/CodeGen/X86/pr190962.ll 
b/clang/test/CodeGen/X86/pr190962.ll
deleted file mode 100644
index acd59b2ee3c46..0000000000000
--- a/clang/test/CodeGen/X86/pr190962.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; REQUIRES: x86-registered-target
-; RUN: %clang -O1 -mapx-features=ndd --target=x86_64-pc-windows-gnu -S %s 
-mllvm -verify-machineinstrs -o /dev/null
-
-;; Check no crash when building below IR with Clang.
-
-define i32 @foo(ptr %0, ptr %1, ptr %2, i64 %3, i64 %4, i64 %5) {
-  %7 = call i64 
@"_ZZN3jxl15PatchDictionary6DecodeEP22JxlMemoryManagerStructPNS_9BitReaderEyyyPbENK3$_0clEy"()
-  %8 = mul i64 %3, %4
-  %9 = icmp ugt i64 1, %8
-  br i1 %9, label %common.ret1, label %10
-
-common.ret1:                                      ; preds = %26, %23, %16, %6
-  %common.ret1.op = phi i32 [ 0, %23 ], [ 0, %16 ], [ 0, %26 ], [ 0, %6 ]
-  ret i32 %common.ret1.op
-
-10:                                               ; preds = %6
-  %11 = load volatile i64, ptr null, align 8
-  %12 = call i64 
@"_ZZN3jxl15PatchDictionary6DecodeEP22JxlMemoryManagerStructPNS_9BitReaderEyyyPbENK3$_0clEy"()
-  %13 = load volatile i64, ptr null, align 8
-  %14 = or i64 %11, %5
-  %15 = icmp ugt i64 %14, 0
-  br i1 %15, label %16, label %19
-
-16:                                               ; preds = %10
-  call void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
-  %17 = load i32, ptr null, align 4
-  %18 = call i32 (i32, ptr, ...) @_ZN3jxl13StatusMessageENS_6StatusEPKcz(i32 
%17, ptr null, ptr null, i32 0)
-  call void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
-  br label %common.ret1
-
-19:                                               ; preds = %10
-  %20 = call i64 @_ZNK3jxl11ImageBundle5ysizeEv(ptr %1)
-  %21 = or i64 %12, %13
-  %22 = icmp ugt i64 %21, 0
-  br i1 %22, label %23, label %26
-
-23:                                               ; preds = %19
-  call void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
-  %24 = load i32, ptr null, align 4
-  %25 = call i32 (i32, ptr, ...) @_ZN3jxl13StatusMessageENS_6StatusEPKcz(i32 
%24, ptr null, ptr null, i32 1)
-  call void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
-  br label %common.ret1
-
-26:                                               ; preds = %19
-  %27 = icmp ugt i64 1, %3
-  br i1 %27, label %common.ret1, label %28
-
-28:                                               ; preds = %26
-  store i32 0, ptr %0, align 4
-  %29 = call i32 (i32, ptr, ...) @_ZN3jxl13StatusMessageENS_6StatusEPKcz(i32 
0, ptr null, ptr null, i32 0, i64 0, i64 0, i64 %4)
-  unreachable
-}
-
-declare i32 @_ZN3jxl13StatusMessageENS_6StatusEPKcz(i32, ptr, ...)
-
-declare i64 
@"_ZZN3jxl15PatchDictionary6DecodeEP22JxlMemoryManagerStructPNS_9BitReaderEyyyPbENK3$_0clEy"()
-
-declare void @_ZN3jxl6StatusC2ENS_10StatusCodeE()
-
-declare i64 @_ZNK3jxl11ImageBundle5ysizeEv(ptr)
-
-; uselistorder directives
-uselistorder ptr @_ZN3jxl13StatusMessageENS_6StatusEPKcz, { 2, 1, 0 }
-uselistorder ptr 
@"_ZZN3jxl15PatchDictionary6DecodeEP22JxlMemoryManagerStructPNS_9BitReaderEyyyPbENK3$_0clEy",
 { 1, 0 }
-uselistorder ptr @_ZN3jxl6StatusC2ENS_10StatusCodeE, { 3, 2, 1, 0 }
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp 
b/llvm/lib/CodeGen/InlineSpiller.cpp
index fab04c44371e9..768c4a733639f 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -1054,9 +1054,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, 
unsigned>> Ops,
     --NumSpills;
   SlotIndex FoldIdx = LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
   if (CopyMI) {
-    LIS.InsertMachineInstrInMaps(*CopyMI);
+    SlotIndex CopyIdx = LIS.InsertMachineInstrInMaps(*CopyMI).getRegSlot();
     if (!MRI.isSSA()) {
-      SlotIndex CopyIdx = LIS.InsertMachineInstrInMaps(*CopyMI).getRegSlot();
       LiveInterval &LI = LIS.getInterval(CopyMI->getOperand(0).getReg());
       VNInfo *VNI = LI.getNextValue(CopyIdx, LIS.getVNInfoAllocator());
       LI.addSegment(LiveRange::Segment(CopyIdx, FoldIdx.getRegSlot(), VNI));
@@ -1103,7 +1102,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, 
unsigned>> Ops,
   // Insert any new instructions other than FoldMI into the LIS maps.
   assert(!MIS.empty() && "Unexpected empty span of instructions!");
   for (MachineInstr &MI : MIS)
-    if (&MI != FoldMI)
+    if (&MI != FoldMI && &MI != CopyMI)
       LIS.InsertMachineInstrInMaps(MI);
 
   // TII.foldMemoryOperand may have left some implicit operands on the
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp 
b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 638adf18997ba..22ea4a08f3957 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -160,11 +160,9 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
   SlotIndex FoldIdx = LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
   if (CopyMI) {
     SlotIndex CopyIdx = LIS.InsertMachineInstrInMaps(*CopyMI).getRegSlot();
-    if (!MRI.isSSA()) {
-      LiveInterval &LI = LIS.getInterval(CopyMI->getOperand(0).getReg());
-      VNInfo *VNI = LI.getNextValue(CopyIdx, LIS.getVNInfoAllocator());
-      LI.addSegment(LiveRange::Segment(CopyIdx, FoldIdx.getRegSlot(), VNI));
-    }
+    LiveInterval &LI = LIS.getInterval(CopyMI->getOperand(0).getReg());
+    VNInfo *VNI = LI.getNextValue(CopyIdx, LIS.getVNInfoAllocator());
+    LI.addSegment(LiveRange::Segment(CopyIdx, FoldIdx.getRegSlot(), VNI));
   }
   // Update the call info.
   if (UseMI->shouldUpdateAdditionalCallInfo())
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp 
b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 50e0bfcac9321..2f6f2eafa1005 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7600,13 +7600,22 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
 
       const TargetRegisterClass &RC = *MF.getRegInfo().getRegClass(SrcReg);
       MachineRegisterInfo &MRI = MF.getRegInfo();
-      Register NewSrc = MRI.isSSA() ? MRI.createVirtualRegister(&RC)
-                                    : MI.getOperand(0).getReg();
+      Register SubReg = X86::NoSubRegister;
+      Register NewSrc = X86::NoSubRegister;
+
+      if (MRI.isSSA()) {
+        NewSrc = MRI.createVirtualRegister(&RC);
+      } else {
+        NewSrc = MI.getOperand(0).getReg();
+        SubReg = MI.getOperand(0).getSubReg();
+      }
+
       CopyMI = BuildMI(*NewMI->getParent(), *NewMI, MI.getDebugLoc(),
                        get(TargetOpcode::COPY))
-                   .addReg(NewSrc, RegState::Define)
+                   .addReg(NewSrc, RegState::Define, SubReg)
                    .addReg(SrcReg);
       NewMI->getOperand(1).setReg(NewSrc);
+      NewMI->getOperand(1).setSubReg(SubReg);
     }
     return NewMI;
   }
diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index 3bb2c4041308f..0a6e98e89f3fc 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -1241,7 +1241,7 @@ entry:
   ret void
 }
 
-define i64 @pr191165(i32 %0, ptr %1) {
+define i64 @pr191165(i32 %a, ptr %b) {
 ; NDD-LABEL: pr191165:
 ; NDD:       # %bb.0:
 ; NDD-NEXT:    movl (%rsi), %eax # encoding: [0x8b,0x06]
@@ -1272,9 +1272,9 @@ define i64 @pr191165(i32 %0, ptr %1) {
 ; NF-NEXT:    movl %eax, 0 # encoding: [0x89,0x04,0x25,0x00,0x00,0x00,0x00]
 ; NF-NEXT:    movslq %edi, %rax # encoding: [0x48,0x63,0xc7]
 ; NF-NEXT:    retq # encoding: [0xc3]
-  %3 = load i32, ptr %1, align 4
-  %4 = or i32 %3, %0
-  store volatile i32 %4, ptr null, align 4
-  %5 = sext i32 %0 to i64
-  ret i64 %5
+  %x = load i32, ptr %b, align 4
+  %y = or i32 %x, %a
+  store volatile i32 %y, ptr null, align 4
+  %z = sext i32 %a to i64
+  ret i64 %z
 }
diff --git a/llvm/test/CodeGen/X86/apx/pr191368.ll 
b/llvm/test/CodeGen/X86/apx/pr191368.ll
new file mode 100644
index 0000000000000..fe3964ee22fd9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/pr191368.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -mattr=ndd | FileCheck %s
+
+define ptr @foo(ptr %a, i32 %b, ptr %c, ptr %d, i32 %e, i32 %f, i32 %g) 
nounwind {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %l0
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rsi
+; CHECK-NEXT:    pushq %rdi
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movq %r9, %r14
+; CHECK-NEXT:    movq %r8, %rbx
+; CHECK-NEXT:    movl %edx, %esi
+; CHECK-NEXT:    movq %rcx, %rdi
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    callq bitstob
+; CHECK-NEXT:    cmpl $-1, %r15d
+; CHECK-NEXT:    movl $-1, %r12d
+; CHECK-NEXT:    cmovll %r15d, %r12d
+; CHECK-NEXT:    cmpl $5, %r13d
+; CHECK-NEXT:    ja .LBB0_5
+; CHECK-NEXT:  # %bb.1: # %l0
+; CHECK-NEXT:    movl $3, %eax
+; CHECK-NEXT:    btl %r13d, %eax
+; CHECK-NEXT:    jb .LBB0_4
+; CHECK-NEXT:  # %bb.2: # %l0
+; CHECK-NEXT:    movl $20, %eax
+; CHECK-NEXT:    btl %r13d, %eax
+; CHECK-NEXT:    jb .LBB0_6
+; CHECK-NEXT:  # %bb.3: # %l1
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:  .LBB0_4: # %.sink.split
+; CHECK-NEXT:    movl %ebp, (%r14)
+; CHECK-NEXT:  .LBB0_5: # %l2
+; CHECK-NEXT:    callq __rv_alloc_D2A
+; CHECK-NEXT:    movl $0, (%rbx)
+; CHECK-NEXT:    movl %r12d, %eax
+; CHECK-NEXT:    subl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq 0, %rcx
+; CHECK-NEXT:    leal 1(%rax,%r15), %eax
+; CHECK-NEXT:    movl %eax, (%rdi)
+; CHECK-NEXT:    callq __Bfree_D2A
+; CHECK-NEXT:    movl %esi, (%rdi)
+; CHECK-NEXT:  .LBB0_6: # %common.ret1
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    popq %rdi
+; CHECK-NEXT:    popq %rsi
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    retq
+l0:
+  %x = tail call ptr @bitstob(ptr null)
+  %y = tail call i32 @llvm.smin.i32(i32 %g, i32 -1)
+  switch i32 %e, label %l2 [
+    i32 0, label %.sink.split
+    i32 1, label %.sink.split
+    i32 2, label %common.ret1
+    i32 4, label %common.ret1
+    i32 3, label %l1
+    i32 5, label %l1
+  ]
+
+common.ret1:                                      ; preds = %l2, %7, %7
+  ret ptr null
+
+l1:                                               ; preds = %7, %7
+  br label %.sink.split
+
+.sink.split:                                      ; preds = %7, %7, %l1
+  %.sink = phi i32 [ %f, %l1 ], [ 0, %l0 ], [ 0, %l0 ]
+  store i32 %.sink, ptr %d, align 4
+  br label %l2
+
+l2:                                               ; preds = %.sink.split, %7
+  %b2 = tail call ptr @__rv_alloc_D2A()
+  store i32 0, ptr %c, align 4
+  %reass.sub = sub i32 %y, %f
+  %b3 = add i32 %reass.sub, 1
+  %b4 = load volatile ptr, ptr null, align 4294967296
+  %b5 = add i32 %b3, %g
+  store i32 %b5, ptr %a, align 4
+  tail call void @__Bfree_D2A()
+  store i32 %b, ptr %a, align 4
+  br label %common.ret1
+}
+
+declare ptr @bitstob(ptr)
+declare void @__Bfree_D2A()
+declare ptr @__rv_alloc_D2A()
+declare i32 @llvm.smin.i32(i32, i32)

>From 02a1bdfbc6f6e8d370688a4c692712e40609136e Mon Sep 17 00:00:00 2001
From: Phoebe Wang <[email protected]>
Date: Fri, 10 Apr 2026 22:04:25 +0800
Subject: [PATCH 4/8] Change SubReg type and add llc test

---
 llvm/lib/Target/X86/X86InstrInfo.cpp  |   4 +-
 llvm/test/CodeGen/X86/apx/pr191368.ll | 121 +++++++++++++++++++++++++-
 2 files changed, 119 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp 
b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 2f6f2eafa1005..479fdef016dfd 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7600,8 +7600,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
 
       const TargetRegisterClass &RC = *MF.getRegInfo().getRegClass(SrcReg);
       MachineRegisterInfo &MRI = MF.getRegInfo();
-      Register SubReg = X86::NoSubRegister;
-      Register NewSrc = X86::NoSubRegister;
+      unsigned SubReg = X86::NoSubRegister;
+      Register NewSrc;
 
       if (MRI.isSSA()) {
         NewSrc = MRI.createVirtualRegister(&RC);
diff --git a/llvm/test/CodeGen/X86/apx/pr191368.ll 
b/llvm/test/CodeGen/X86/apx/pr191368.ll
index fe3964ee22fd9..e29110f00a035 100644
--- a/llvm/test/CodeGen/X86/apx/pr191368.ll
+++ b/llvm/test/CodeGen/X86/apx/pr191368.ll
@@ -74,18 +74,18 @@ l0:
     i32 5, label %l1
   ]
 
-common.ret1:                                      ; preds = %l2, %7, %7
+common.ret1:                                      ; preds = %l2, %l0, %l0
   ret ptr null
 
-l1:                                               ; preds = %7, %7
+l1:                                               ; preds = %l0, %l0
   br label %.sink.split
 
-.sink.split:                                      ; preds = %7, %7, %l1
+.sink.split:                                      ; preds = %l0, %l0, %l1
   %.sink = phi i32 [ %f, %l1 ], [ 0, %l0 ], [ 0, %l0 ]
   store i32 %.sink, ptr %d, align 4
   br label %l2
 
-l2:                                               ; preds = %.sink.split, %7
+l2:                                               ; preds = %.sink.split, %l0
   %b2 = tail call ptr @__rv_alloc_D2A()
   store i32 0, ptr %c, align 4
   %reass.sub = sub i32 %y, %f
@@ -102,3 +102,116 @@ declare ptr @bitstob(ptr)
 declare void @__Bfree_D2A()
 declare ptr @__rv_alloc_D2A()
 declare i32 @llvm.smin.i32(i32, i32)
+
+define i32 @pr190962(ptr %a, ptr %b, ptr %c, i64 %d, i64 %e, i64 %f) nounwind {
+; CHECK-LABEL: pr190962:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rsi
+; CHECK-NEXT:    pushq %rdi
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    movq %r9, %rdi
+; CHECK-NEXT:    movq %rdx, %rbx
+; CHECK-NEXT:    movq %rcx, %rsi
+; CHECK-NEXT:    callq f2
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    imulq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    je .LBB1_3
+; CHECK-NEXT:  # %bb.1: # %l1
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; CHECK-NEXT:    movq 0, %r15
+; CHECK-NEXT:    callq f2
+; CHECK-NEXT:    movq 0, %r12
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    orq %r13, %r15
+; CHECK-NEXT:    jne .LBB1_2
+; CHECK-NEXT:  # %bb.4: # %l2
+; CHECK-NEXT:    movq %rax, %r14
+; CHECK-NEXT:    movq %rbx, %rcx
+; CHECK-NEXT:    callq f4
+; CHECK-NEXT:    movl $1, %ebp
+; CHECK-NEXT:    orq %r12, %r14
+; CHECK-NEXT:    je .LBB1_5
+; CHECK-NEXT:  .LBB1_2: # %common.ret1.sink.split
+; CHECK-NEXT:    callq f3
+; CHECK-NEXT:    # implicit-def: $ecx
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %r8d, %r8d
+; CHECK-NEXT:    movl %ebp, %r9d
+; CHECK-NEXT:    callq f1
+; CHECK-NEXT:    callq f3
+; CHECK-NEXT:  .LBB1_3: # %common.ret1
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    popq %rdi
+; CHECK-NEXT:    popq %rsi
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB1_5: # %l3
+; CHECK-NEXT:    testq %rdi, %rdi
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    je .LBB1_3
+; CHECK-NEXT:  # %bb.6: # %l4
+; CHECK-NEXT:    movl $0, (%rsi)
+; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %r8d, %r8d
+; CHECK-NEXT:    xorl %r9d, %r9d
+; CHECK-NEXT:    callq f1
+  %g = tail call i64 @f2()
+  %h = mul i64 %e, %d
+  %i = icmp eq i64 %h, 0
+  br i1 %i, label %common.ret1, label %l1
+
+common.ret1.sink.split:                           ; preds = %l2, %l1
+  %.sink = phi i32 [ 0, %l1 ], [ 1, %l2 ]
+  tail call void @f3()
+  %b0 = tail call i32 (i32, ptr, ...) @f1(i32 undef, ptr null, ptr null, i32 
%.sink)
+  tail call void @f3()
+  br label %common.ret1
+
+common.ret1:                                      ; preds = 
%common.ret1.sink.split, %l3, %6
+  ret i32 0
+
+l1:                                               ; preds = %6
+  %b2 = load volatile i64, ptr null, align 4294967296
+  %b3 = tail call i64 @f2()
+  %b4 = load volatile i64, ptr null, align 4294967296
+  %b5 = or i64 %b2, %f
+  %.not = icmp eq i64 %b5, 0
+  br i1 %.not, label %l2, label %common.ret1.sink.split
+
+l2:                                               ; preds = %l1
+  %b7 = tail call i64 @f4(ptr %b)
+  %b8 = or i64 %b4, %b3
+  %.not1 = icmp eq i64 %b8, 0
+  br i1 %.not1, label %l3, label %common.ret1.sink.split
+
+l3:                                               ; preds = %l2
+  %c0 = icmp eq i64 %d, 0
+  br i1 %c0, label %common.ret1, label %l4
+
+l4:                                               ; preds = %l3
+  store i32 0, ptr %a, align 4
+  %c2 = tail call i32 (i32, ptr, ...) @f1(i32 0, ptr null, ptr null, i32 0, 
i64 0, i64 0, i64 %e)
+  unreachable
+}
+
+declare i32 @f1(i32, ptr, ...)
+declare i64 @f2()
+declare void @f3()
+declare i64 @f4(ptr)

>From de2145aedeed611c47189a0da14f2a48f4deb6fa Mon Sep 17 00:00:00 2001
From: Phoebe Wang <[email protected]>
Date: Fri, 10 Apr 2026 23:08:53 +0800
Subject: [PATCH 5/8] Fix source subreg issue

---
 llvm/lib/Target/X86/X86InstrInfo.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp 
b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 479fdef016dfd..e932e9ad1c837 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7595,6 +7595,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
 
     if (NoNDDM && !IsTwoAddr) {
       Register SrcReg = MI.getOperand(1).getReg();
+      unsigned SrcSub = MI.getOperand(1).getSubReg();
       if (MI.killsRegister(SrcReg, /*TRI=*/nullptr))
         return NewMI;
 
@@ -7613,7 +7614,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
       CopyMI = BuildMI(*NewMI->getParent(), *NewMI, MI.getDebugLoc(),
                        get(TargetOpcode::COPY))
                    .addReg(NewSrc, RegState::Define, SubReg)
-                   .addReg(SrcReg);
+                   .addReg(SrcReg, {}, SrcSub);
       NewMI->getOperand(1).setReg(NewSrc);
       NewMI->getOperand(1).setSubReg(SubReg);
     }

>From 57676eb9a900b158c74d3b8919aab5978320b483 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <[email protected]>
Date: Sat, 11 Apr 2026 11:14:07 +0800
Subject: [PATCH 6/8] Fix machine verifier failure and address comments

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h |  1 +
 llvm/lib/CodeGen/InlineSpiller.cpp          |  5 ++++
 llvm/lib/CodeGen/LiveRangeEdit.cpp          | 14 ++++++-----
 llvm/lib/Target/X86/X86InstrInfo.cpp        | 27 ++++++++++++---------
 llvm/test/CodeGen/X86/apx/pr191368.ll       | 10 ++++----
 5 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h 
b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index e69f377fd0bd4..e08f2a524ead1 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1254,6 +1254,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   /// operand folded, otherwise NULL is returned.
   /// The new instruction is inserted before MI, and the client is responsible
   /// for removing the old instruction.
+  /// If a copy instruction is created during the fold, it is returned via CopyMI.
   /// If VRM is passed, the assigned physregs can be inspected by target to
   /// decide on using an opcode (note that those assignments can still change).
   MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp 
b/llvm/lib/CodeGen/InlineSpiller.cpp
index 768c4a733639f..352e1961332ad 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -1105,6 +1105,11 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, 
unsigned>> Ops,
     if (&MI != FoldMI && &MI != CopyMI)
       LIS.InsertMachineInstrInMaps(MI);
 
+  if (CopyMI) {
+    LiveInterval &LI = LIS.getInterval(CopyMI->getOperand(1).getReg());
+    LIS.shrinkToUses(&LI);
+  }
+
   // TII.foldMemoryOperand may have left some implicit operands on the
   // instruction.  Strip them.
   if (ImpReg)
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp 
b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 22ea4a08f3957..0141d940d498f 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -158,12 +158,6 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
     return false;
   LLVM_DEBUG(dbgs() << "                folded: " << *FoldMI);
   SlotIndex FoldIdx = LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
-  if (CopyMI) {
-    SlotIndex CopyIdx = LIS.InsertMachineInstrInMaps(*CopyMI).getRegSlot();
-    LiveInterval &LI = LIS.getInterval(CopyMI->getOperand(0).getReg());
-    VNInfo *VNI = LI.getNextValue(CopyIdx, LIS.getVNInfoAllocator());
-    LI.addSegment(LiveRange::Segment(CopyIdx, FoldIdx.getRegSlot(), VNI));
-  }
   // Update the call info.
   if (UseMI->shouldUpdateAdditionalCallInfo())
     UseMI->getMF()->moveAdditionalCallInfo(UseMI, FoldMI);
@@ -171,6 +165,14 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
   DefMI->addRegisterDead(LI->reg(), nullptr);
   Dead.push_back(DefMI);
   ++NumDCEFoldedLoads;
+  if (CopyMI) {
+    SlotIndex CopyIdx = LIS.InsertMachineInstrInMaps(*CopyMI).getRegSlot();
+    LiveInterval &LI = LIS.getInterval(CopyMI->getOperand(0).getReg());
+    VNInfo *VNI = LI.getNextValue(CopyIdx, LIS.getVNInfoAllocator());
+    LI.addSegment(LiveRange::Segment(CopyIdx, FoldIdx.getRegSlot(), VNI));
+    LiveInterval &SrcLI = LIS.getInterval(CopyMI->getOperand(1).getReg());
+    LIS.shrinkToUses(&SrcLI);
+  }
   return true;
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp 
b/llvm/lib/Target/X86/X86InstrInfo.cpp
index e932e9ad1c837..796b10f1ef798 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7543,6 +7543,18 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   // Utilize the mapping NonNDD if NDD memory variant is not preferred.
   bool NoNDDM = NonNDOpc && !Subtarget.hasNDDM();
 
+  // Bail out if the dst has a subreg. This happens during register coalescing, e.g. from
+  // 704B  %19:gr32 = SUB32rr_ND killed %0:gr32, killed %7:gr32, ...
+  // 752B  undef %23.sub_32bit:gr64 = COPY killed %19:gr32
+  // 768B  %25:gr32 = LEA64_32r killed %23:gr64, 1, killed %21:gr64_nosp, ...
+  // to
+  // 704B  undef %23.sub_32bit:gr64_with_sub_8bit = SUB32rr_ND %0:gr32, ...
+  // 768B  %25:gr32 = LEA64_32r %23:gr64_with_sub_8bit, 1, %21:gr64_nosp, ...
+  // Machine verifier fails if we try to tie %23 to the source.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  if (NoNDDM && !MRI.isSSA() && MI.getOperand(0).getSubReg())
+    return nullptr;
+
   const X86FoldTableEntry *I =
       IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
                 : lookupFoldTable(NoNDDM ? NonNDOpc : Opc, OpNum);
@@ -7600,23 +7612,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
         return NewMI;
 
       const TargetRegisterClass &RC = *MF.getRegInfo().getRegClass(SrcReg);
-      MachineRegisterInfo &MRI = MF.getRegInfo();
-      unsigned SubReg = X86::NoSubRegister;
-      Register NewSrc;
-
-      if (MRI.isSSA()) {
-        NewSrc = MRI.createVirtualRegister(&RC);
-      } else {
-        NewSrc = MI.getOperand(0).getReg();
-        SubReg = MI.getOperand(0).getSubReg();
-      }
+      Register NewSrc = MRI.isSSA() ? MRI.createVirtualRegister(&RC) :
+                                      MI.getOperand(0).getReg();
 
       CopyMI = BuildMI(*NewMI->getParent(), *NewMI, MI.getDebugLoc(),
                        get(TargetOpcode::COPY))
-                   .addReg(NewSrc, RegState::Define, SubReg)
+                   .addDef(NewSrc)
                    .addReg(SrcReg, {}, SrcSub);
       NewMI->getOperand(1).setReg(NewSrc);
-      NewMI->getOperand(1).setSubReg(SubReg);
     }
     return NewMI;
   }
diff --git a/llvm/test/CodeGen/X86/apx/pr191368.ll 
b/llvm/test/CodeGen/X86/apx/pr191368.ll
index e29110f00a035..b6a87b590d98c 100644
--- a/llvm/test/CodeGen/X86/apx/pr191368.ll
+++ b/llvm/test/CodeGen/X86/apx/pr191368.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -mattr=ndd | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -mattr=ndd 
-verify-machineinstrs | FileCheck %s
 
 define ptr @foo(ptr %a, i32 %b, ptr %c, ptr %d, i32 %e, i32 %f, i32 %g) 
nounwind {
 ; CHECK-LABEL: foo:
@@ -43,8 +43,8 @@ define ptr @foo(ptr %a, i32 %b, ptr %c, ptr %d, i32 %e, i32 
%f, i32 %g) nounwind
 ; CHECK-NEXT:  .LBB0_5: # %l2
 ; CHECK-NEXT:    callq __rv_alloc_D2A
 ; CHECK-NEXT:    movl $0, (%rbx)
-; CHECK-NEXT:    movl %r12d, %eax
-; CHECK-NEXT:    subl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    subl %eax, %r12d, %eax
 ; CHECK-NEXT:    movq 0, %rcx
 ; CHECK-NEXT:    leal 1(%rax,%r15), %eax
 ; CHECK-NEXT:    movl %eax, (%rdi)
@@ -140,7 +140,7 @@ define i32 @pr190962(ptr %a, ptr %b, ptr %c, i64 %d, i64 
%e, i64 %f) nounwind {
 ; CHECK-NEXT:    je .LBB1_5
 ; CHECK-NEXT:  .LBB1_2: # %common.ret1.sink.split
 ; CHECK-NEXT:    callq f3
-; CHECK-NEXT:    # implicit-def: $ecx
+; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    xorl %r8d, %r8d
 ; CHECK-NEXT:    movl %ebp, %r9d
@@ -180,7 +180,7 @@ define i32 @pr190962(ptr %a, ptr %b, ptr %c, i64 %d, i64 
%e, i64 %f) nounwind {
 common.ret1.sink.split:                           ; preds = %l2, %l1
   %.sink = phi i32 [ 0, %l1 ], [ 1, %l2 ]
   tail call void @f3()
-  %b0 = tail call i32 (i32, ptr, ...) @f1(i32 undef, ptr null, ptr null, i32 
%.sink)
+  %b0 = tail call i32 (i32, ptr, ...) @f1(i32 0, ptr null, ptr null, i32 
%.sink)
   tail call void @f3()
   br label %common.ret1
 

>From d7c1407b6621c998f62aade0bdbb5b00791cc7c7 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <[email protected]>
Date: Sat, 11 Apr 2026 11:20:24 +0800
Subject: [PATCH 7/8] clang-format

---
 llvm/lib/Target/X86/X86InstrInfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp 
b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 796b10f1ef798..b4ae52b2b5700 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7612,8 +7612,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
         return NewMI;
 
       const TargetRegisterClass &RC = *MF.getRegInfo().getRegClass(SrcReg);
-      Register NewSrc = MRI.isSSA() ? MRI.createVirtualRegister(&RC) :
-                                      MI.getOperand(0).getReg();
+      Register NewSrc = MRI.isSSA() ? MRI.createVirtualRegister(&RC)
+                                    : MI.getOperand(0).getReg();
 
       CopyMI = BuildMI(*NewMI->getParent(), *NewMI, MI.getDebugLoc(),
                        get(TargetOpcode::COPY))

>From bca46ef2833e4b5e7f7e9457a075aae569e9eda3 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <[email protected]>
Date: Sat, 11 Apr 2026 11:23:58 +0800
Subject: [PATCH 8/8] Add comment for optimizeLoadInstr too

---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h 
b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index e08f2a524ead1..e013a3e256664 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1842,7 +1842,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
   /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual 
register
   /// defined by the load we are trying to fold. DefMI returns the machine
   /// instruction that defines FoldAsLoadDefReg, and the function returns
-  /// the machine instruction generated due to folding.
+  /// the machine instruction generated due to folding. CopyMI returns the
+  /// copy instruction possibly generated due to folding.
   virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
                                           const MachineRegisterInfo *MRI,
                                           Register &FoldAsLoadDefReg,

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to