https://github.com/PankajDwivedi-25 created https://github.com/llvm/llvm-project/pull/175904
Reverts llvm/llvm-project#169345 to address the reviews >From 44014689178470a5e327b26ff6255dc064eac39d Mon Sep 17 00:00:00 2001 From: Pankaj Dwivedi <[email protected]> Date: Wed, 14 Jan 2026 14:15:59 +0530 Subject: [PATCH] =?UTF-8?q?Revert=20"[AMDGPU][SIInsertWaitcnt]=20Implement?= =?UTF-8?q?=20Waitcnt=20Expansion=20for=20Profiling=20(=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 3dfb782333bf929945f63e5b0b1cad378b0bd87a. --- clang/include/clang/Basic/CodeGenOptions.def | 4 - clang/include/clang/Options/Options.td | 7 - clang/lib/CodeGen/Targets/AMDGPU.cpp | 2 - llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 295 ++---- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 19 - llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 20 - .../AMDGPU/expand-waitcnt-profiling.ll | 944 ------------------ 7 files changed, 92 insertions(+), 1199 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index baf8b093c10e6..6cdbffc456193 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -466,10 +466,6 @@ CODEGENOPT(AAPCSBitfieldWidth, 1, 1, Benign) /// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only) CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1, Benign) -/// Enable expanded waitcnt for profiling (AMDGPU Only) -/// Expands s_waitcnt instructions to help PC-sampling profilers identify stalls. -CODEGENOPT(AMDGPUExpandWaitcntProfiling, 1, 0, Benign) - // Whether to emit Swift Async function extended frame information: auto, // never, always. ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2, diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 2f57a5b13b917..5ad0ff2a773c8 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -5585,13 +5585,6 @@ defm amdgpu_ieee : BoolMOption<"amdgpu-ieee", "This option changes the ABI. (AMDGPU only)">, NegFlag<SetFalse, [], [ClangOption, CC1Option]>>; -defm amdgpu_expand_waitcnt_profiling : BoolMOption<"amdgpu-expand-waitcnt-profiling", - CodeGenOpts<"AMDGPUExpandWaitcntProfiling">, DefaultFalse, - PosFlag<SetTrue, [], [ClangOption, CC1Option], "Expand s_waitcnt instructions to help " - "PC-sampling profilers identify memory stalls. Instead of a single waitcnt(target), " - "emits waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target). (AMDGPU only)">, - NegFlag<SetFalse, [], [ClangOption]>>; - def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>, HelpText<"Specify code object ABI version. Defaults to 6. (AMDGPU only)">, Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>, diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 4bc9557b26b52..0ab6c753b8bad 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -443,8 +443,6 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( setFunctionDeclAttributes(FD, F, M); if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts) F->addFnAttr("amdgpu-ieee", "false"); - if (getABIInfo().getCodeGenOpts().AMDGPUExpandWaitcntProfiling) - F->addFnAttr("amdgpu-expand-waitcnt-profiling"); } unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const { diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b3e834b66ad45..bf842e0ecb4af 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -105,35 +105,6 @@ auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { return enum_seq(LOAD_CNT, MaxCounter); } -// Get the maximum wait count value for a given counter type. -static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits, - InstCounterType T) { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - case VA_VDST: - return Limits.VaVdstMax; - case VM_VSRC: - return Limits.VmVsrcMax; - default: - return 0; - } -} - /// Integer IDs used to track vector memory locations we may have to wait on. /// Encoded as u16 chunks: /// @@ -169,6 +140,19 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) { return static_cast<unsigned>(RU); } +struct HardwareLimits { + unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. + unsigned ExpcntMax; + unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. + unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. + unsigned SamplecntMax; // gfx12+ only. + unsigned BvhcntMax; // gfx12+ only. + unsigned KmcntMax; // gfx12+ only. + unsigned XcntMax; // gfx1250. + unsigned VaVdstMax; // gfx12+ expert mode only. + unsigned VmVsrcMax; // gfx12+ expert mode only. +}; + #define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \ DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \ DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \ @@ -330,27 +314,19 @@ class WaitcntGenerator { AMDGPU::IsaVersion IV; InstCounterType MaxCounter; bool OptNone; - bool ExpandWaitcntProfiling = false; - const AMDGPU::HardwareLimits *Limits = nullptr; public: WaitcntGenerator() = default; - WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter, - const AMDGPU::HardwareLimits *Limits) + WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter) : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()), IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter), OptNone(MF.getFunction().hasOptNone() || - MF.getTarget().getOptLevel() == CodeGenOptLevel::None), - ExpandWaitcntProfiling( - MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")), - Limits(Limits) {} + MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {} // Return true if the current function should be compiled with no // optimization. bool isOptNone() const { return OptNone; } - const AMDGPU::HardwareLimits &getLimits() const { return *Limits; } - // Edits an existing sequence of wait count instructions according // to an incoming Waitcnt value, which is itself updated to reflect // any new wait count instructions which may need to be generated by @@ -372,11 +348,9 @@ class WaitcntGenerator { // Generates new wait count instructions according to the value of // Wait, returning true if any new instructions were created. - // If ScoreBrackets is provided, it can be used for profiling expansion. virtual bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, - WaitcntBrackets *ScoreBrackets = nullptr) = 0; + AMDGPU::Waitcnt Wait) = 0; // Returns an array of bit masks which can be used to map values in // WaitEventType to corresponding counter values in InstCounterType. @@ -401,10 +375,7 @@ class WaitcntGenerator { class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { public: - WaitcntGeneratorPreGFX12() = default; - WaitcntGeneratorPreGFX12(const MachineFunction &MF, - const AMDGPU::HardwareLimits *Limits) - : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {} + using WaitcntGenerator::WaitcntGenerator; bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -413,8 +384,7 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, - WaitcntBrackets *ScoreBrackets = nullptr) override; + AMDGPU::Waitcnt Wait) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -446,10 +416,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { public: WaitcntGeneratorGFX12Plus() = default; WaitcntGeneratorGFX12Plus(const MachineFunction &MF, - InstCounterType MaxCounter, - const AMDGPU::HardwareLimits *Limits, - bool IsExpertMode) - : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {} + InstCounterType MaxCounter, bool IsExpertMode) + : WaitcntGenerator(MF, MaxCounter), IsExpertMode(IsExpertMode) {} bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, @@ -458,8 +426,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, - WaitcntBrackets *ScoreBrackets = nullptr) override; + AMDGPU::Waitcnt Wait) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -533,7 +500,7 @@ class SIInsertWaitcnts { // message. DenseSet<MachineInstr *> ReleaseVGPRInsts; - AMDGPU::HardwareLimits Limits; + HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -544,7 +511,33 @@ class SIInsertWaitcnts { (void)ForceVMCounter; } - const AMDGPU::HardwareLimits &getLimits() const { return Limits; } + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + case VA_VDST: + return Limits.VaVdstMax; + case VM_VSRC: + return Limits.VmVsrcMax; + default: + break; + } + return 0; + } PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML, const WaitcntBrackets &Brackets); @@ -776,7 +769,7 @@ class WaitcntBrackets { unsigned getPendingGDSWait() const { return std::min(getScoreUB(DS_CNT) - LastGDS, - getWaitCountMax(Context->getLimits(), DS_CNT) - 1); + Context->getWaitCountMax(DS_CNT) - 1); } void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } @@ -803,8 +796,8 @@ class WaitcntBrackets { } void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + - getWaitCountMax(Context->getLimits(), STORE_CNT)); + setScoreUB(STORE_CNT, + getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; } @@ -860,9 +853,8 @@ class WaitcntBrackets { if (T != EXP_CNT) return; - if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT)) - ScoreLBs[EXP_CNT] = - ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT); + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); } void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) { @@ -1365,8 +1357,8 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T, } else { // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. - unsigned NeededWait = std::min( - UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1); + unsigned NeededWait = + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1683,109 +1675,38 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( /// required counters in \p Wait bool WaitcntGeneratorPreGFX12::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { + AMDGPU::Waitcnt Wait) { assert(ST); assert(isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); - // Helper to emit expanded waitcnt sequence for profiling. - // Emits waitcnts from (Outstanding-1) down to Target, or just Target if - // nothing to expand. The EmitWaitcnt callback emits a single waitcnt. - auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, - auto EmitWaitcnt) { - if (Outstanding > Target) { - for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) { - EmitWaitcnt(i); - Modified = true; - } - } else { - EmitWaitcnt(Target); - Modified = true; - } - }; - // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a // single instruction while VScnt has its own instruction. if (Wait.hasWaitExceptStoreCnt()) { - // If profiling expansion is enabled and we have score brackets, - // emit an expanded sequence - if (ExpandWaitcntProfiling && ScoreBrackets) { - // Check if any of the counters to be waited on are out-of-order. - // If so, fall back to normal (non-expanded) behavior since expansion - // would provide misleading profiling information. - bool AnyOutOfOrder = false; - for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { - unsigned &WaitCnt = getCounterRef(Wait, CT); - if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) { - AnyOutOfOrder = true; - break; - } - } - - if (AnyOutOfOrder) { - // Fall back to non-expanded wait - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; - } else { - // All counters are in-order, safe to expand - for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) { - unsigned &WaitCnt = getCounterRef(Wait, CT); - if (WaitCnt == ~0u) - continue; - - unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - - ScoreBrackets->getScoreLB(CT), - getWaitCountMax(getLimits(), CT) - 1); - EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) { - AMDGPU::Waitcnt W; - getCounterRef(W, CT) = Count; - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(AMDGPU::encodeWaitcnt(IV, W)); - }); - } - } - } else { - // Normal behavior: emit single combined waitcnt - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; + Modified = true; - LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); - } + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); } if (Wait.hasWaitStoreCnt()) { assert(ST->hasVscnt()); - if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u && - !ScoreBrackets->counterOutOfOrder(STORE_CNT)) { - // Only expand if counter is not out-of-order - unsigned Outstanding = - std::min(ScoreBrackets->getScoreUB(STORE_CNT) - - ScoreBrackets->getScoreLB(STORE_CNT), - getWaitCountMax(getLimits(), STORE_CNT) - 1); - EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) { + [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Count); - }); - } else { - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.StoreCnt); - Modified = true; + .addImm(Wait.StoreCnt); + Modified = true; - LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); - } + LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); } return Modified; @@ -2082,55 +2003,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { + AMDGPU::Waitcnt Wait) { assert(ST); assert(!isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = Block.findDebugLoc(It); - // Helper to emit expanded waitcnt sequence for profiling. - auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target, - auto EmitWaitcnt) { - if (Outstanding > Target) { - for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) { - EmitWaitcnt(i); - Modified = true; - } - } else { - EmitWaitcnt(Target); - Modified = true; - } - }; - - // For GFX12+, we use separate wait instructions, which makes expansion - // simpler - if (ExpandWaitcntProfiling && ScoreBrackets) { - for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { - unsigned Count = getWait(Wait, CT); - if (Count == ~0u) - continue; - - // Skip expansion for out-of-order counters - emit normal wait instead - if (ScoreBrackets->counterOutOfOrder(CT)) { - BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) - .addImm(Count); - Modified = true; - continue; - } - - unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) - - ScoreBrackets->getScoreLB(CT), - getWaitCountMax(getLimits(), CT) - 1); - EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) { - BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) - .addImm(Val); - }); - } - return Modified; - } - - // Normal behavior (no expansion) // Check for opportunities to use combined wait instructions. if (Wait.DsCnt != ~0u) { MachineInstr *SWaitInst = nullptr; @@ -2529,7 +2408,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, Modified = WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It); - AMDGPU::Waitcnt WaitForScore = Wait; + // Any counts that could have been applied to any existing waitcnt + // instructions will have been done so, now deal with any remaining. + ScoreBrackets.applyWaitcnt(Wait); // ExpCnt can be merged into VINTERP. if (Wait.ExpCnt != ~0u && It != Block.instr_end() && @@ -2546,13 +2427,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, << "Update Instr: " << *It); } - if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets)) + if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; - // Any counts that could have been applied to any existing waitcnt - // instructions will have been done so, now deal with any remaining. - ScoreBrackets.applyWaitcnt(WaitForScore); - return Modified; } @@ -3259,9 +3136,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); - // Initialize hardware limits first, as they're needed by the generators. - Limits = AMDGPU::HardwareLimits(IV, ST->hasExtendedWaitCounts()); - if (ST->hasExtendedWaitCounts()) { IsExpertMode = ST->hasExpertSchedulingMode() && (ExpertSchedulingModeFlag.getNumOccurrences() @@ -3270,12 +3144,11 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { .getFnAttribute("amdgpu-expert-scheduling-mode") .getValueAsBool()); MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS; - WCGGFX12Plus = - WaitcntGeneratorGFX12Plus(MF, MaxCounter, &Limits, IsExpertMode); + WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter, IsExpertMode); WCG = &WCGGFX12Plus; } else { MaxCounter = NUM_NORMAL_INST_CNTS; - WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, &Limits); + WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter); WCG = &WCGPreGFX12; } @@ -3286,6 +3159,22 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); + if (ST->hasExtendedWaitCounts()) { + Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); + Limits.DscntMax = AMDGPU::getDscntBitMask(IV); + } else { + Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV); + Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV); + } + Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); + Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV); + Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV); + Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); + Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); + Limits.XcntMax = AMDGPU::getXcntBitMask(IV); + Limits.VaVdstMax = AMDGPU::DepCtr::getVaVdstBitMask(); + Limits.VmVsrcMax = AMDGPU::DepCtr::getVmVsrcBitMask(); + BlockInfos.clear(); bool Modified = false; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a6017f57714d4..ef384999851e9 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1785,25 +1785,6 @@ unsigned getStorecntBitMask(const IsaVersion &Version) { return (1 << getStorecntBitWidth(Version.Major)) - 1; } -HardwareLimits::HardwareLimits(const IsaVersion &IV, - bool HasExtendedWaitCounts) { - if (HasExtendedWaitCounts) { - LoadcntMax = getLoadcntBitMask(IV); - DscntMax = getDscntBitMask(IV); - } else { - LoadcntMax = getVmcntBitMask(IV); - DscntMax = getLgkmcntBitMask(IV); - } - ExpcntMax = getExpcntBitMask(IV); - StorecntMax = getStorecntBitMask(IV); - SamplecntMax = getSamplecntBitMask(IV); - BvhcntMax = getBvhcntBitMask(IV); - KmcntMax = getKmcntBitMask(IV); - XcntMax = getXcntBitMask(IV); - VaVdstMax = DepCtr::getVaVdstBitMask(); - VmVsrcMax = DepCtr::getVmVsrcBitMask(); -} - unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major), getVmcntBitWidthLo(Version.Major)); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 770f9a86dc883..f6b95602644ca 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1131,26 +1131,6 @@ struct Waitcnt { friend raw_ostream &operator<<(raw_ostream &OS, const AMDGPU::Waitcnt &Wait); }; -/// Represents the hardware counter limits for different wait count types. -struct HardwareLimits { - unsigned LoadcntMax; // Corresponds to Vmcnt prior to gfx12. - unsigned ExpcntMax; - unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12. - unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11. - unsigned SamplecntMax; // gfx12+ only. - unsigned BvhcntMax; // gfx12+ only. - unsigned KmcntMax; // gfx12+ only. - unsigned XcntMax; // gfx1250. - unsigned VaVdstMax; // gfx12+ expert mode only. - unsigned VmVsrcMax; // gfx12+ expert mode only. - - HardwareLimits() = default; - - /// Initializes hardware limits from ISA version. - /// \p HasExtendedWaitCounts should be true for gfx12+. - HardwareLimits(const IsaVersion &IV, bool HasExtendedWaitCounts); -}; - // The following methods are only meaningful on targets that support // S_WAITCNT. diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll deleted file mode 100644 index 848a9d07084ed..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll +++ /dev/null @@ -1,944 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-EXPAND %s -; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx900 | FileCheck --check-prefix=GFX9-NOEXPAND %s -; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-EXPAND %s -; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1010 | FileCheck --check-prefix=GFX10-NOEXPAND %s -; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-EXPAND %s -; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1100 | FileCheck --check-prefix=GFX11-NOEXPAND %s -; RUN: sed 's/ATTRS/\"amdgpu-expand-waitcnt-profiling\"/g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-EXPAND %s -; RUN: sed 's/ATTRS//g' %s | llc -mtriple=amdgcn -mcpu=gfx1200 | FileCheck --check-prefix=GFX12-NOEXPAND %s - -; When -amdgpu-expand-waitcnt-profiling is enabled and there are N outstanding -; operations, instead of emitting a single waitcnt(target), we emit: -; waitcnt(N-1), waitcnt(N-2), ..., waitcnt(target) -; -; This allows PC-sampling profilers to identify which specific operation -; is causing a stall by observing where the program counter is stuck. - -define amdgpu_kernel void @test_lgkmcnt_scalar_loads(ptr addrspace(4) %ptr_a, ptr addrspace(4) %ptr_b, ptr addrspace(4) %ptr_c, ptr addrspace(1) %out) #0 { -; GFX9-EXPAND-LABEL: test_lgkmcnt_scalar_loads: -; GFX9-EXPAND: ; %bb.0: -; GFX9-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX9-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX9-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s1 -; GFX9-EXPAND-NEXT: s_add_i32 s0, s0, s2 -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] -; GFX9-EXPAND-NEXT: s_endpgm -; -; GFX9-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: -; GFX9-NOEXPAND: ; %bb.0: -; GFX9-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX9-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] -; GFX9-NOEXPAND-NEXT: s_endpgm -; -; GFX10-EXPAND-LABEL: test_lgkmcnt_scalar_loads: -; GFX10-EXPAND: ; %bb.0: -; GFX10-EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX10-EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s1 -; GFX10-EXPAND-NEXT: s_add_i32 s0, s0, s2 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[14:15] -; GFX10-EXPAND-NEXT: s_endpgm -; -; GFX10-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: -; GFX10-NOEXPAND: ; %bb.0: -; GFX10-NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 -; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] -; GFX10-NOEXPAND-NEXT: s_endpgm -; -; GFX11-EXPAND-LABEL: test_lgkmcnt_scalar_loads: -; GFX11-EXPAND: ; %bb.0: -; GFX11-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 -; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s2 -; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX11-EXPAND-NEXT: s_endpgm -; -; GFX11-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: -; GFX11-NOEXPAND: ; %bb.0: -; GFX11-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 -; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s2 -; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX11-NOEXPAND-NEXT: s_endpgm -; -; GFX12-EXPAND-LABEL: test_lgkmcnt_scalar_loads: -; GFX12-EXPAND: ; %bb.0: -; GFX12-EXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 -; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s2 -; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-EXPAND-NEXT: s_endpgm -; -; GFX12-NOEXPAND-LABEL: test_lgkmcnt_scalar_loads: -; GFX12-NOEXPAND: ; %bb.0: -; GFX12-NOEXPAND-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 -; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s2 -; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[6:7] -; GFX12-NOEXPAND-NEXT: s_endpgm - - %val_a = load i32, ptr addrspace(4) %ptr_a, align 4 - %val_b = load i32, ptr addrspace(4) %ptr_b, align 4 - %val_c = load i32, ptr addrspace(4) %ptr_c, align 4 - %sum1 = add i32 %val_a, %val_b - %sum2 = add i32 %sum1, %val_c - store i32 %sum2, ptr addrspace(1) %out, align 4 - ret void -} - -define amdgpu_kernel void @test_vmcnt_global_loads(ptr addrspace(1) %buf, ptr addrspace(1) %out) #0 { -; GFX9-EXPAND-LABEL: test_vmcnt_global_loads: -; GFX9-EXPAND: ; %bb.0: -; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 -; GFX9-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 -; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(2) -; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(1) -; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) -; GFX9-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] -; GFX9-EXPAND-NEXT: s_endpgm -; -; GFX9-NOEXPAND-LABEL: test_vmcnt_global_loads: -; GFX9-NOEXPAND: ; %bb.0: -; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 -; GFX9-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 -; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] -; GFX9-NOEXPAND-NEXT: s_endpgm -; -; GFX10-EXPAND-LABEL: test_vmcnt_global_loads: -; GFX10-EXPAND: ; %bb.0: -; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: s_clause 0x2 -; GFX10-EXPAND-NEXT: global_load_dword v1, v0, s[0:1] -; GFX10-EXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 -; GFX10-EXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 -; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(2) -; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(1) -; GFX10-EXPAND-NEXT: s_waitcnt vmcnt(0) -; GFX10-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[2:3] -; GFX10-EXPAND-NEXT: s_endpgm -; -; GFX10-NOEXPAND-LABEL: test_vmcnt_global_loads: -; GFX10-NOEXPAND: ; %bb.0: -; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: s_clause 0x2 -; GFX10-NOEXPAND-NEXT: global_load_dword v1, v0, s[0:1] -; GFX10-NOEXPAND-NEXT: global_load_dword v2, v0, s[0:1] offset:256 -; GFX10-NOEXPAND-NEXT: global_load_dword v3, v0, s[0:1] offset:512 -; GFX10-NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; GFX10-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[2:3] -; GFX10-NOEXPAND-NEXT: s_endpgm -; -; GFX11-EXPAND-LABEL: test_vmcnt_global_loads: -; GFX11-EXPAND: ; %bb.0: -; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: s_clause 0x2 -; GFX11-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX11-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 -; GFX11-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 -; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(2) -; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(1) -; GFX11-EXPAND-NEXT: s_waitcnt vmcnt(0) -; GFX11-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-EXPAND-NEXT: s_endpgm -; -; GFX11-NOEXPAND-LABEL: test_vmcnt_global_loads: -; GFX11-NOEXPAND: ; %bb.0: -; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: s_clause 0x2 -; GFX11-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX11-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 -; GFX11-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 -; GFX11-NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX11-NOEXPAND-NEXT: s_endpgm -; -; GFX12-EXPAND-LABEL: test_vmcnt_global_loads: -; GFX12-EXPAND: ; %bb.0: -; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-EXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: s_clause 0x2 -; GFX12-EXPAND-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX12-EXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 -; GFX12-EXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 -; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x2 -; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x1 -; GFX12-EXPAND-NEXT: s_wait_loadcnt 0x0 -; GFX12-EXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-EXPAND-NEXT: s_endpgm -; -; GFX12-NOEXPAND-LABEL: test_vmcnt_global_loads: -; GFX12-NOEXPAND: ; %bb.0: -; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NOEXPAND-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: s_clause 0x2 -; GFX12-NOEXPAND-NEXT: global_load_b32 v1, v0, s[0:1] -; GFX12-NOEXPAND-NEXT: global_load_b32 v2, v0, s[0:1] offset:256 -; GFX12-NOEXPAND-NEXT: global_load_b32 v3, v0, s[0:1] offset:512 -; GFX12-NOEXPAND-NEXT: s_wait_loadcnt 0x0 -; GFX12-NOEXPAND-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-NOEXPAND-NEXT: s_endpgm - - ; Use thread ID to create thread-varying addresses -> forces vector loads - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %tid64 = zext i32 %tid to i64 - - ; Three separate global loads with thread-varying addresses - ; Non-volatile loads allow multiple operations to be in-flight - %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 - %val0 = load i32, ptr addrspace(1) %ptr0, align 4 - - %offset1 = add i64 %tid64, 64 - %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 - %val1 = load i32, ptr addrspace(1) %ptr1, align 4 - - %offset2 = add i64 %tid64, 128 - %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 - %val2 = load i32, ptr addrspace(1) %ptr2, align 4 - - %sum1 = add i32 %val0, %val1 - %sum2 = add i32 %sum1, %val2 - - %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tid64 - store i32 %sum2, ptr addrspace(1) %out_ptr, align 4 - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() - -define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr, ptr addrspace(1) %out) #0 { -; GFX9-EXPAND-LABEL: test_lgkmcnt_lds_operations: -; GFX9-EXPAND: ; %bb.0: -; GFX9-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 -; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; GFX9-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-EXPAND-NEXT: global_store_dword v3, v0, s[0:1] -; GFX9-EXPAND-NEXT: s_endpgm -; -; GFX9-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: -; GFX9-NOEXPAND: ; %bb.0: -; GFX9-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 -; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; GFX9-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NOEXPAND-NEXT: global_store_dword v3, v0, s[0:1] -; GFX9-NOEXPAND-NEXT: s_endpgm -; -; GFX10-EXPAND-LABEL: test_lgkmcnt_lds_operations: -; GFX10-EXPAND: ; %bb.0: -; GFX10-EXPAND-NEXT: s_clause 0x1 -; GFX10-EXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 -; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; GFX10-EXPAND-NEXT: ds_read_b32 v2, v2 offset:8 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-EXPAND-NEXT: s_endpgm -; -; GFX10-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: -; GFX10-NOEXPAND: ; %bb.0: -; GFX10-NOEXPAND-NEXT: s_clause 0x1 -; GFX10-NOEXPAND-NEXT: s_load_dword s2, s[4:5], 0x24 -; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; GFX10-NOEXPAND-NEXT: ds_read_b32 v2, v2 offset:8 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-NOEXPAND-NEXT: s_endpgm -; -; GFX11-EXPAND-LABEL: test_lgkmcnt_lds_operations: -; GFX11-EXPAND: ; %bb.0: -; GFX11-EXPAND-NEXT: s_clause 0x1 -; GFX11-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 -; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1) -; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-EXPAND-NEXT: s_endpgm -; -; GFX11-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: -; GFX11-NOEXPAND: ; %bb.0: -; GFX11-NOEXPAND-NEXT: s_clause 0x1 -; GFX11-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 -; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1) -; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NOEXPAND-NEXT: s_endpgm -; -; GFX12-EXPAND-LABEL: test_lgkmcnt_lds_operations: -; GFX12-EXPAND: ; %bb.0: -; GFX12-EXPAND-NEXT: s_clause 0x1 -; GFX12-EXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 -; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8 -; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1 -; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 -; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-EXPAND-NEXT: s_endpgm -; -; GFX12-NOEXPAND-LABEL: test_lgkmcnt_lds_operations: -; GFX12-NOEXPAND: ; %bb.0: -; GFX12-NOEXPAND-NEXT: s_clause 0x1 -; GFX12-NOEXPAND-NEXT: s_load_b32 s2, s[4:5], 0x24 -; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1 -; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8 -; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1 -; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 -; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-NOEXPAND-NEXT: s_endpgm - - %ptr0 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 0 - %ptr1 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 - %ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 2 - %val0 = load i32, ptr addrspace(3) %ptr0, align 4 - %val1 = load i32, ptr addrspace(3) %ptr1, align 4 - %val2 = load i32, ptr addrspace(3) %ptr2, align 4 - %sum1 = add i32 %val0, %val1 - %sum2 = add i32 %sum1, %val2 - store i32 %sum2, ptr addrspace(1) %out, align 4 - ret void -} - -define amdgpu_kernel void @test_combined_vmcnt_lgkmcnt(ptr addrspace(4) %scalar_ptr_a, ptr addrspace(4) %scalar_ptr_b, ptr addrspace(1) %out) #0 { -; GFX9-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: -; GFX9-EXPAND: ; %bb.0: -; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: s_add_i32 s0, s4, s5 -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] -; GFX9-EXPAND-NEXT: s_endpgm -; -; GFX9-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: -; GFX9-NOEXPAND: ; %bb.0: -; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] -; GFX9-NOEXPAND-NEXT: s_endpgm -; -; GFX10-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: -; GFX10-EXPAND: ; %bb.0: -; GFX10-EXPAND-NEXT: s_clause 0x1 -; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: s_add_i32 s0, s4, s5 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[6:7] -; GFX10-EXPAND-NEXT: s_endpgm -; -; GFX10-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: -; GFX10-NOEXPAND: ; %bb.0: -; GFX10-NOEXPAND-NEXT: s_clause 0x1 -; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: s_add_i32 s0, s4, s5 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] -; GFX10-NOEXPAND-NEXT: s_endpgm -; -; GFX11-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: -; GFX11-EXPAND: ; %bb.0: -; GFX11-EXPAND-NEXT: s_clause 0x1 -; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: s_add_i32 s0, s0, s1 -; GFX11-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-EXPAND-NEXT: s_endpgm -; -; GFX11-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: -; GFX11-NOEXPAND: ; %bb.0: -; GFX11-NOEXPAND-NEXT: s_clause 0x1 -; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: s_add_i32 s0, s0, s1 -; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX11-NOEXPAND-NEXT: s_endpgm -; -; GFX12-EXPAND-LABEL: test_combined_vmcnt_lgkmcnt: -; GFX12-EXPAND: ; %bb.0: -; GFX12-EXPAND-NEXT: s_clause 0x1 -; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-EXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-EXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: s_add_co_i32 s0, s0, s1 -; GFX12-EXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-EXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX12-EXPAND-NEXT: s_endpgm -; -; GFX12-NOEXPAND-LABEL: test_combined_vmcnt_lgkmcnt: -; GFX12-NOEXPAND: ; %bb.0: -; GFX12-NOEXPAND-NEXT: s_clause 0x1 -; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NOEXPAND-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-NOEXPAND-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: s_add_co_i32 s0, s0, s1 -; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX12-NOEXPAND-NEXT: s_endpgm - - %scalar_val1 = load i32, ptr addrspace(4) %scalar_ptr_a, align 4 - %scalar_val2 = load i32, ptr addrspace(4) %scalar_ptr_b, align 4 - - %result = add i32 %scalar_val1, %scalar_val2 - store i32 %result, ptr addrspace(1) %out, align 4 - ret void -} - -; Test that expansion is NOT applied when counters are out-of-order (mixed event types). -; In pre-GFX12, LDS and SMEM operations both use DS_CNT (lgkmcnt), but they can complete -; out-of-order relative to each other. When both are in-flight, we should NOT expand -; because the expansion would be misleading. -define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_ptr, ptr addrspace(4) %smem_ptr, ptr addrspace(1) %out) #0 { -; GFX9-EXPAND-LABEL: test_outoforder_lds_and_smem: -; GFX9-EXPAND: ; %bb.0: -; GFX9-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 -; GFX9-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; GFX9-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-EXPAND-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-EXPAND-NEXT: global_store_dword v2, v0, s[2:3] -; GFX9-EXPAND-NEXT: s_endpgm -; -; GFX9-NOEXPAND-LABEL: test_outoforder_lds_and_smem: -; GFX9-NOEXPAND: ; %bb.0: -; GFX9-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 -; GFX9-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; GFX9-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NOEXPAND-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NOEXPAND-NEXT: global_store_dword v2, v0, s[2:3] -; GFX9-NOEXPAND-NEXT: s_endpgm -; -; GFX10-EXPAND-LABEL: test_outoforder_lds_and_smem: -; GFX10-EXPAND: ; %bb.0: -; GFX10-EXPAND-NEXT: s_clause 0x1 -; GFX10-EXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 -; GFX10-EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-EXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-EXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX10-EXPAND-NEXT: global_store_dword v1, v0, s[2:3] -; GFX10-EXPAND-NEXT: s_endpgm -; -; GFX10-NOEXPAND-LABEL: test_outoforder_lds_and_smem: -; GFX10-NOEXPAND: ; %bb.0: -; GFX10-NOEXPAND-NEXT: s_clause 0x1 -; GFX10-NOEXPAND-NEXT: s_load_dword s6, s[4:5], 0x24 -; GFX10-NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NOEXPAND-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-NOEXPAND-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX10-NOEXPAND-NEXT: global_store_dword v1, v0, s[2:3] -; GFX10-NOEXPAND-NEXT: s_endpgm -; -; GFX11-EXPAND-LABEL: test_outoforder_lds_and_smem: -; GFX11-EXPAND: ; %bb.0: -; GFX11-EXPAND-NEXT: s_clause 0x1 -; GFX11-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 -; GFX11-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-EXPAND-NEXT: s_endpgm -; -; GFX11-NOEXPAND-LABEL: test_outoforder_lds_and_smem: -; GFX11-NOEXPAND: ; %bb.0: -; GFX11-NOEXPAND-NEXT: s_clause 0x1 -; GFX11-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 -; GFX11-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX11-NOEXPAND-NEXT: s_endpgm -; -; GFX12-EXPAND-LABEL: test_outoforder_lds_and_smem: -; GFX12-EXPAND: ; %bb.0: -; GFX12-EXPAND-NEXT: s_clause 0x1 -; GFX12-EXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 -; GFX12-EXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 -; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0 -; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-EXPAND-NEXT: s_endpgm -; -; GFX12-NOEXPAND-LABEL: test_outoforder_lds_and_smem: -; GFX12-NOEXPAND: ; %bb.0: -; GFX12-NOEXPAND-NEXT: s_clause 0x1 -; GFX12-NOEXPAND-NEXT: s_load_b32 s6, s[4:5], 0x24 -; GFX12-NOEXPAND-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 -; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0 -; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3] -; GFX12-NOEXPAND-NEXT: s_endpgm - - %lds_val1 = load i32, ptr addrspace(3) %lds_ptr, align 4 - %smem_val = load i32, ptr addrspace(4) %smem_ptr, align 4 - %lds_ptr2 = getelementptr i32, ptr addrspace(3) %lds_ptr, i32 1 - %lds_val2 = load i32, ptr addrspace(3) %lds_ptr2, align 4 - %sum1 = add i32 %lds_val1, %lds_val2 - %sum2 = add i32 %sum1, %smem_val - store i32 %sum2, ptr addrspace(1) %out, align 4 - ret void -} - -define amdgpu_kernel void @test_vscnt_global_stores(ptr addrspace(1) %buf) #0 { -; Test vector memory stores (STORE_CNT/vscnt on GFX10-11, storecnt on GFX12+) -; GFX9-EXPAND-LABEL: test_vscnt_global_stores: -; GFX9-EXPAND: ; %bb.0: ; %entry -; GFX9-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] -; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256 -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v1, 3 -; GFX9-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512 -; GFX9-EXPAND-NEXT: s_waitcnt vmcnt(0) -; GFX9-EXPAND-NEXT: s_endpgm -; -; GFX9-NOEXPAND-LABEL: test_vscnt_global_stores: -; GFX9-NOEXPAND: ; %bb.0: ; %entry -; GFX9-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] -; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:256 -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v1, 3 -; GFX9-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:512 -; GFX9-NOEXPAND-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOEXPAND-NEXT: s_endpgm -; -; GFX10-EXPAND-LABEL: test_vscnt_global_stores: -; GFX10-EXPAND: ; %bb.0: ; %entry -; GFX10-EXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v2, 2 -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v3, 3 -; GFX10-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-EXPAND-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256 -; GFX10-EXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512 -; GFX10-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-EXPAND-NEXT: s_endpgm -; -; GFX10-NOEXPAND-LABEL: test_vscnt_global_stores: -; GFX10-NOEXPAND: ; %bb.0: ; %entry -; GFX10-NOEXPAND-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v2, 2 -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v3, 3 -; GFX10-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:256 -; GFX10-NOEXPAND-NEXT: global_store_dword v0, v3, s[0:1] offset:512 -; GFX10-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NOEXPAND-NEXT: s_endpgm -; -; GFX11-EXPAND-LABEL: test_vscnt_global_stores: -; GFX11-EXPAND: ; %bb.0: ; %entry -; GFX11-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 -; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-EXPAND-NEXT: s_clause 0x2 -; GFX11-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 -; GFX11-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 -; GFX11-EXPAND-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-EXPAND-NEXT: s_endpgm -; -; GFX11-NOEXPAND-LABEL: test_vscnt_global_stores: -; GFX11-NOEXPAND: ; %bb.0: ; %entry -; GFX11-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 -; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOEXPAND-NEXT: s_clause 0x2 -; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 -; GFX11-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 -; GFX11-NOEXPAND-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NOEXPAND-NEXT: s_endpgm -; -; GFX12-EXPAND-LABEL: test_vscnt_global_stores: -; GFX12-EXPAND: ; %bb.0: ; %entry -; GFX12-EXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-EXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 -; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-EXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-EXPAND-NEXT: s_clause 0x2 -; GFX12-EXPAND-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-EXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 -; GFX12-EXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 -; GFX12-EXPAND-NEXT: global_wb scope:SCOPE_SYS -; GFX12-EXPAND-NEXT: s_wait_storecnt 0x0 -; GFX12-EXPAND-NEXT: s_endpgm -; -; GFX12-NOEXPAND-LABEL: test_vscnt_global_stores: -; GFX12-NOEXPAND: ; %bb.0: ; %entry -; GFX12-NOEXPAND-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 3 -; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NOEXPAND-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0 -; GFX12-NOEXPAND-NEXT: s_clause 0x2 -; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v2, s[0:1] offset:256 -; GFX12-NOEXPAND-NEXT: global_store_b32 v0, v3, s[0:1] offset:512 -; GFX12-NOEXPAND-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NOEXPAND-NEXT: s_wait_storecnt 0x0 -; GFX12-NOEXPAND-NEXT: s_endpgm -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %tid64 = zext i32 %tid to i64 - - ; Issue multiple stores - %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 %tid64 - store i32 1, ptr addrspace(1) %ptr0, align 4 - - %offset1 = add i64 %tid64, 64 - %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset1 - store i32 2, ptr addrspace(1) %ptr1, align 4 - - %offset2 = add i64 %tid64, 128 - %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 %offset2 - store i32 3, ptr addrspace(1) %ptr2, align 4 - - ; Memory fence forces wait for all stores - fence release - ret void -} - -define amdgpu_ps void @test_expcnt_exports(float %x, float %y, float %z, float %w) #0 { -; Test export operations (EXP_CNT/expcnt) -; GFX9-EXPAND-LABEL: test_expcnt_exports: -; GFX9-EXPAND: ; %bb.0: ; %entry -; GFX9-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX9-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 -; GFX9-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 -; GFX9-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 -; GFX9-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done -; GFX9-EXPAND-NEXT: s_endpgm -; -; GFX9-NOEXPAND-LABEL: test_expcnt_exports: -; GFX9-NOEXPAND: ; %bb.0: ; %entry -; GFX9-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX9-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 -; GFX9-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 -; GFX9-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 -; GFX9-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done -; GFX9-NOEXPAND-NEXT: s_endpgm -; -; GFX10-EXPAND-LABEL: test_expcnt_exports: -; GFX10-EXPAND: ; %bb.0: ; %entry -; GFX10-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX10-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 -; GFX10-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 -; GFX10-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 -; GFX10-EXPAND-NEXT: exp param0 v4, v4, v4, v4 done -; GFX10-EXPAND-NEXT: s_endpgm -; -; GFX10-NOEXPAND-LABEL: test_expcnt_exports: -; GFX10-NOEXPAND: ; %bb.0: ; %entry -; GFX10-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX10-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 -; GFX10-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 -; GFX10-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 -; GFX10-NOEXPAND-NEXT: exp param0 v4, v4, v4, v4 done -; GFX10-NOEXPAND-NEXT: s_endpgm -; -; GFX11-EXPAND-LABEL: test_expcnt_exports: -; GFX11-EXPAND: ; %bb.0: ; %entry -; GFX11-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-EXPAND-NEXT: exp mrt0 v0, v1, v2, v3 -; GFX11-EXPAND-NEXT: exp mrt1 v3, v2, v1, v0 -; GFX11-EXPAND-NEXT: exp mrt2 v0, v3, v1, v2 -; GFX11-EXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done -; GFX11-EXPAND-NEXT: s_endpgm -; -; GFX11-NOEXPAND-LABEL: test_expcnt_exports: -; GFX11-NOEXPAND: ; %bb.0: ; %entry -; GFX11-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NOEXPAND-NEXT: exp mrt0 v0, v1, v2, v3 -; GFX11-NOEXPAND-NEXT: exp mrt1 v3, v2, v1, v0 -; GFX11-NOEXPAND-NEXT: exp mrt2 v0, v3, v1, v2 -; GFX11-NOEXPAND-NEXT: exp invalid_target_32 v4, v4, v4, v4 done -; GFX11-NOEXPAND-NEXT: s_endpgm -; -; GFX12-EXPAND-LABEL: test_expcnt_exports: -; GFX12-EXPAND: ; %bb.0: ; %entry -; GFX12-EXPAND-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX12-EXPAND-NEXT: export mrt0 v0, v1, v2, v3 -; GFX12-EXPAND-NEXT: export mrt1 v3, v2, v1, v0 -; GFX12-EXPAND-NEXT: export mrt2 v0, v3, v1, v2 -; GFX12-EXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done -; GFX12-EXPAND-NEXT: s_endpgm -; -; GFX12-NOEXPAND-LABEL: test_expcnt_exports: -; GFX12-NOEXPAND: ; %bb.0: ; %entry -; GFX12-NOEXPAND-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX12-NOEXPAND-NEXT: export mrt0 v0, v1, v2, v3 -; GFX12-NOEXPAND-NEXT: export mrt1 v3, v2, v1, v0 -; GFX12-NOEXPAND-NEXT: export mrt2 v0, v3, v1, v2 -; GFX12-NOEXPAND-NEXT: export invalid_target_32 v4, v4, v4, v4 done -; GFX12-NOEXPAND-NEXT: s_endpgm -entry: - ; Multiple MRT exports - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false) - call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %w, float %z, float %y, float %x, i1 false, i1 false) - call void @llvm.amdgcn.exp.f32(i32 2, i32 15, float %x, float %w, float %y, float %z, i1 false, i1 false) - ; Final export with done bit - call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 1.0, float 1.0, float 1.0, i1 true, i1 false) - ret void -} - -declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) - -attributes #0 = { nounwind ATTRS } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
