https://github.com/optimisan created https://github.com/llvm/llvm-project/pull/130061
None >From 10605a79e1d1c6d1c227b98019fd4a4c568345b8 Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Thu, 6 Mar 2025 04:41:08 +0000 Subject: [PATCH] [AMDGPU][NPM] Port SIInsertWaitcnts to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 91 +++++++++++++------ llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir | 1 + .../CodeGen/AMDGPU/insert-waitcnts-hang.mir | 1 + .../AMDGPU/vccz-corrupt-bug-workaround.mir | 2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 23b9aa0cf0523..dbd81add85753 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -369,6 +369,13 @@ class SIMemoryLegalizerPass : public PassInfoMixin<SIMemoryLegalizerPass> { MachineFunctionAnalysisManager &MFAM); }; +class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -445,7 +452,7 @@ extern char &AMDGPUInsertDelayAluID; void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; -void initializeSIInsertWaitcntsPass(PassRegistry&); +void initializeSIInsertWaitcntsLegacyPass(PassRegistry &); extern char &SIInsertWaitcntsID; void initializeSIFormMemoryClausesLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index de959f8a2aa62..c4641cba60e53 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) +MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) @@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index dbe212ad0a216..c3cc1dc6e495b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -535,7 +535,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIAnnotateControlFlowLegacyPass(*PR); initializeAMDGPUInsertDelayAluLegacyPass(*PR); initializeSIInsertHardClausesPass(*PR); - initializeSIInsertWaitcntsPass(*PR); + initializeSIInsertWaitcntsLegacyPass(*PR); initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); @@ -2153,7 +2153,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { } addPass(SIMemoryLegalizerPass()); - // TODO: addPass(SIInsertWaitcntsPass()); + addPass(SIInsertWaitcntsPass()); // TODO: addPass(SIModeRegisterPass()); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ee263f58bcaf2..8951a4144bd68 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/Sequence.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Support/DebugCounter.h" #include "llvm/TargetParser/TargetParser.h" @@ -594,7 +595,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; -class SIInsertWaitcnts : public MachineFunctionPass { +class SIInsertWaitcnts { private: const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; @@ -633,9 +634,9 @@ class SIInsertWaitcnts : public MachineFunctionPass { InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; public: - static char ID; - - SIInsertWaitcnts() : MachineFunctionPass(ID) { + SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, + AliasAnalysis *AA) + : MLI(MLI), PDT(PDT), AA(AA) { (void)ForceExpCounter; (void)ForceLgkmCounter; (void)ForceVMCounter; @@ -645,20 +646,7 @@ class SIInsertWaitcnts : public MachineFunctionPass { bool isPreheaderToFlush(MachineBasicBlock &MBB, WaitcntBrackets &ScoreBrackets); bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { - return "SI insert wait instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<MachineLoopInfoWrapperPass>(); - AU.addRequired<MachinePostDominatorTreeWrapperPass>(); - AU.addUsedIfAvailable<AAResultsWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - MachineFunctionPass::getAnalysisUsage(AU); - } + bool run(MachineFunction &MF); bool isForceEmitWaitcnt() const { for (auto T : inst_counter_types()) @@ -742,6 +730,36 @@ class SIInsertWaitcnts : public MachineFunctionPass { WaitcntBrackets &ScoreBrackets); }; +class SIInsertWaitcntsLegacy : public MachineFunctionPass { +public: + static char ID; + SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); + auto *PDT = + &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); + AliasAnalysis *AA = nullptr; + if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) + AA = &AAR->getAAResults(); + + return SIInsertWaitcnts(MLI, PDT, AA).run(MF); + } + + StringRef getPassName() const override { + return "SI insert wait instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfoWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); + AU.addUsedIfAvailable<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // end anonymous namespace RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, @@ -1124,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { return hasMixedPendingEvents(T); } -INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, - false) +INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts", + false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, - false) +INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts", + false, false) -char SIInsertWaitcnts::ID = 0; +char SIInsertWaitcntsLegacy::ID = 0; -char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; +char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID; FunctionPass *llvm::createSIInsertWaitcntsPass() { - return new SIInsertWaitcnts(); + return new SIInsertWaitcntsLegacy(); } static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, @@ -2406,16 +2424,29 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); } -bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { +PreservedAnalyses +SIInsertWaitcntsPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF); + auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF); + auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF) + .getManager() + .getCachedResult<AAManager>(MF.getFunction()); + + if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF)) + return PreservedAnalyses::all(); + + return getMachineFunctionPassPreservedAnalyses() + .preserveSet<CFGAnalyses>() + .preserve<AAManager>(); +} + +bool SIInsertWaitcnts::run(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); - PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); - if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) - AA = &AAR->getAAResults(); AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); diff --git a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir index b6dc75db3edc1..0456d5cc463f1 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s # $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so # that the return address is not clobbered in the callee by the outstanding load. diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir index 28d79efc00b0d..2834ca5fa6858 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s --- name: test diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 17e3d93ed393b..f5321591a3c88 100644 --- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -2,6 +2,8 @@ # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s + +# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s --- # CHECK-LABEL: name: vccz_corrupt_workaround # CHECK: $vcc = V_CMP_EQ_F32 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits