https://github.com/hidekisaito created 
https://github.com/llvm/llvm-project/pull/171948

Add insertDSPreheaderFlushes() to insert S_WAIT_DSCNT 0 in loop preheaders when 
DS wait relaxation was applied.

Assisted-by: Cursor / claude-4.5-opus-high

Depends on https://github.com/llvm/llvm-project/pull/171944

From 70beea81a01952a7de4cbb0d33c060d9946c05a5 Mon Sep 17 00:00:00 2001
From: Hideki Saito <[email protected]>
Date: Thu, 11 Dec 2025 20:02:23 -0500
Subject: [PATCH] [AMDGPU] Add DS loop preheader flush (3/4)

Add insertDSPreheaderFlushes() to insert S_WAIT_DSCNT 0 in loop preheaders
when DS wait relaxation was applied.

Assisted-by: Cursor / claude-4.5-opus-high
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 67 +++++++++++++++++++
 .../AMDGPU/waitcnt-loop-ds-opt-eligible.mir   |  6 +-
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 777491fb58b80..28bc57ed2db4e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -605,6 +605,7 @@ class SIInsertWaitcnts {
   std::optional<unsigned> getOptimalDSWaitCount(MachineBasicBlock *LoopHeader,
                                                 const MachineInstr &MI) const;
   bool applyDSLoopWaitOpt(MachineInstr &MI, AMDGPU::Waitcnt &Wait);
+  bool insertDSPreheaderFlushes(MachineFunction &MF);
 };
 
 // This objects maintains the current score brackets of each wait counter, and
@@ -2904,6 +2905,68 @@ bool SIInsertWaitcnts::applyDSLoopWaitOpt(MachineInstr &MI,
   return true;
 }
 
+// Insert DS_CNT flush in preheaders of loops where DS wait relaxation was
+// applied. The relaxed wait counts inside the loop are computed based on the
+// DS loads issued at the end of the previous iteration (via backedge), but
+// the first iteration enters via the preheader, so all DS loads from the
+// preheader must be complete before entering the loop. Loops without a
+// preheader are skipped. Returns true if any instruction was added/changed.
+bool SIInsertWaitcnts::insertDSPreheaderFlushes(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &[LoopHeader, Info] : LoopDSWaitOptCache) {
+    if (!Info.Valid || !Info.RelaxationApplied)
+      continue;
+
+    MachineLoop *ML = MLI->getLoopFor(LoopHeader);
+    if (!ML)
+      continue;
+
+    MachineBasicBlock *Preheader = ML->getLoopPreheader();
+    if (!Preheader)
+      continue;
+
+    // Insert s_wait_dscnt 0 at the end of the preheader, before terminators.
+    // In a fallthrough preheader getFirstTerminator() returns end(), so the
+    // wait lands after the last instruction and covers a trailing DS load.
+    MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+    // Check if there's already a DS wait right before the insertion point.
+    bool NeedInsert = true;
+    if (InsertPos != Preheader->begin()) {
+      auto CheckPos = std::prev(InsertPos);
+      if (CheckPos->getOpcode() == AMDGPU::S_WAIT_DSCNT_soft ||
+          CheckPos->getOpcode() == AMDGPU::S_WAIT_DSCNT) {
+        if (CheckPos->getOperand(0).getImm() == 0)
+          NeedInsert = false;
+        else {
+          // Change existing wait to 0
+          CheckPos->getOperand(0).setImm(0);
+          NeedInsert = false;
+          Modified = true;
+          LLVM_DEBUG(dbgs() << "DS Loop Opt: Changed existing DS_CNT wait to 0"
+                            << " in preheader ";
+                     Preheader->printName(dbgs()); dbgs() << "\n");
+        }
+      }
+    }
+
+    if (NeedInsert) {
+      DebugLoc DL;
+      if (InsertPos != Preheader->end())
+        DL = InsertPos->getDebugLoc();
+      BuildMI(*Preheader, InsertPos, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft))
+          .addImm(0);
+      Modified = true;
+      LLVM_DEBUG(dbgs() << "DS Loop Opt: Inserted DS_CNT flush in preheader ";
+                 Preheader->printName(dbgs()); dbgs() << " for loop at ";
+                 LoopHeader->printName(dbgs()); dbgs() << "\n");
+    }
+  }
+
+  return Modified;
+}
+
 // Return true if it is better to flush the vmcnt counter in the preheader of
 // the given loop. We currently decide to flush in two situations:
 // 1. The loop contains vmem store(s), no vmem load and at least one use of a
@@ -3250,6 +3313,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
       }
     }
   }
+
+  // Insert DS_CNT flushes in preheaders of loops that had wait counts relaxed.
+  Modified |= insertDSPreheaderFlushes(MF);
+
   ReleaseVGPRInsts.clear();
   PreheadersToFlush.clear();
   LoopDSWaitOptCache.clear();
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
index 48fdabf255e6f..e6237338fda5b 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-opt-eligible.mir
@@ -17,6 +17,7 @@
 # DBG: Loop DS Wait Opt: Loop at bb.1 - 16 DS loads, 8 WMMA/MFMA, {{[0-9]+}} total insts, eligible
 # DBG: Loop DS Wait Opt: Analyzed loop at bb.1 - 16 DS loads, HasBarrier=1, Valid=1
 # DBG: DS Loop Opt: Relaxing DsCnt from 0 to 12 for:
+# DBG: DS Loop Opt: Inserted DS_CNT flush in preheader bb.0 for loop at bb.1
 
 --- |
   define amdgpu_kernel void @ds_loop_eligible() { ret void }
@@ -31,9 +32,10 @@ machineFunctionInfo:
   isEntryFunction: true
   waveLimiter: false
 body: |
+  ; Check preheader: OPT adds S_WAIT_DSCNT 0 flush, NOOPT does not
   ; OPT: bb.0:
-  ; OPT-NOT: S_WAIT_DSCNT
-  ; OPT: S_BRANCH %bb.1
+  ; OPT: S_WAIT_DSCNT_soft 0
+  ; OPT-NEXT: S_BRANCH %bb.1
 
   ; NOOPT: bb.0:
   ; NOOPT-NOT: S_WAIT_DSCNT

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to