https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/169345
>From beb404722561291859b6bcd7c0615ea7616967d2 Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi <[email protected]> Date: Mon, 24 Nov 2025 21:00:58 +0530 Subject: [PATCH 1/3] Implement compiler option -mamdgpu-expand-waitcnt-profiling to expand waitcnt instruction --- clang/include/clang/Driver/Options.td | 5 +- clang/lib/Driver/ToolChains/AMDGPU.cpp | 4 + clang/test/Driver/amdgpu-features.c | 6 + llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 + llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 165 ++++++++++++ .../AMDGPU/expand-waitcnt-profiling.ll | 239 ++++++++++++++++++ 7 files changed, 427 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 11e81e032d5fc..c0ba716484b6a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5497,7 +5497,10 @@ defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64", " mode (AMDGPU only)">; defm amdgpu_precise_memory_op : SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable", - " precise memory mode (AMDGPU only)">; + " precise memory mode (AMDGPU only)", m_amdgpu_Features_Group>; +defm amdgpu_expand_waitcnt_profiling + : SimpleMFlag<"amdgpu-expand-waitcnt-profiling", "Enable", "Disable", + " waitcnt expansion for profiling (AMDGPU only)", m_amdgpu_Features_Group>; def munsafe_fp_atomics : Flag<["-"], "munsafe-fp-atomics">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Alias<fatomic_ignore_denormal_mode>; diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 1a243fef9532d..f4ddb48c9abc6 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -700,6 +700,10 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, options::OPT_mno_amdgpu_precise_memory_op, false)) Features.push_back("+precise-memory"); + if 
(Args.hasFlag(options::OPT_mamdgpu_expand_waitcnt_profiling, + options::OPT_mno_amdgpu_expand_waitcnt_profiling, false)) + Features.push_back("+expand-waitcnt-profiling"); + handleTargetFeaturesGroup(D, Triple, Args, Features, options::OPT_m_amdgpu_Features_Group); } diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c index 864744db203e9..16b3f4121ab7a 100644 --- a/clang/test/Driver/amdgpu-features.c +++ b/clang/test/Driver/amdgpu-features.c @@ -38,3 +38,9 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s // NO-PREC-MEM-NOT: {{".*precise-memory"}} + +// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mamdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=EXPAND-WAITCNT %s +// EXPAND-WAITCNT: "-target-feature" "+expand-waitcnt-profiling" + +// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mno-amdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=NO-EXPAND-WAITCNT %s +// NO-EXPAND-WAITCNT-NOT: "{{[+]}}expand-waitcnt-profiling" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 54d94b1f8682e..3f9166f48ea22 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -223,6 +223,10 @@ def FeaturePreciseMemory : SubtargetFeature<"precise-memory", "EnablePreciseMemory", "true", "Enable precise memory mode">; +def FeatureExpandWaitcntProfiling + : SubtargetFeature<"expand-waitcnt-profiling", "EnableExpandWaitcntProfiling", + "true", "Expand waitcnt instructions for profiling">; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f377b8aaf1333..f2b885a790f41 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -90,6 +90,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool EnableCuMode = 
false; bool TrapHandler = false; bool EnablePreciseMemory = false; + bool EnableExpandWaitcntProfiling = false; // Used as options. bool EnableLoadStoreOpt = false; @@ -674,6 +675,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } + bool isExpandWaitcntProfilingEnabled() const { + return EnableExpandWaitcntProfiling; + } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b7fa899678ec7..4a70479358bad 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -494,6 +494,16 @@ class SIInsertWaitcnts { bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool run(MachineFunction &MF); + // Methods for expanding waitcnt instructions for profiling + bool expandWaitcntsForProfiling(MachineFunction &MF); + bool expandSingleWaitcnt(MachineInstr &MI, MachineBasicBlock &MBB); + bool expandSingleCounterWait(MachineInstr &MI, MachineBasicBlock &MBB, + InstCounterType CT); + bool expandCounterSequence(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPos, + InstCounterType CT, unsigned CountValue, + DebugLoc DL); + void setForceEmitWaitcnt() { // For non-debug builds, ForceEmitWaitcnt has been initialized to false; // For debug builds, get the debug counter info and adjust if need be @@ -2725,6 +2735,156 @@ SIInsertWaitcntsPass::run(MachineFunction &MF, .preserve<AAManager>(); } +/// Expand waitcnt instructions for profiling by inserting a sequence of +/// decreasing counter values. This helps identify which specific memory +/// operation is a bottleneck during PC sampling. 
+bool SIInsertWaitcnts::expandWaitcntsForProfiling(MachineFunction &MF) { + if (!ST->isExpandWaitcntProfilingEnabled()) + return false; + + bool Modified = false; + + // Iterate through all basic blocks + for (MachineBasicBlock &MBB : MF) { + for (auto I = MBB.begin(), E = MBB.end(); I != E;) { + MachineInstr &MI = *I; + ++I; // Advance iterator before potential expansion + + if (ST->hasExtendedWaitCounts()) { + // GFX12+: Handle separate wait instructions + if (auto CT = counterTypeForInstr(MI.getOpcode())) { + Modified |= expandSingleCounterWait(MI, MBB, *CT); + } + } else { + // Pre-GFX12: Handle combined S_WAITCNT + if (MI.getOpcode() == AMDGPU::S_WAITCNT) { + Modified |= expandSingleWaitcnt(MI, MBB); + } + } + } + } + + return Modified; +} + +/// Expand a single S_WAITCNT instruction (pre-GFX12) +bool SIInsertWaitcnts::expandSingleWaitcnt(MachineInstr &MI, + MachineBasicBlock &MBB) { + assert(MI.getOpcode() == AMDGPU::S_WAITCNT); + + // Decode the waitcnt immediate + unsigned Imm = MI.getOperand(0).getImm(); + AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); + AMDGPU::Waitcnt Wait = AMDGPU::decodeWaitcnt(IV, Imm); + + // Insert expanded waitcnts BEFORE the original instruction + auto InsertPos = MI.getIterator(); + DebugLoc DL = MI.getDebugLoc(); + + bool Modified = false; + + // Expand each counter independently + // For independent counters (Case 2 from requirements): + // vmcnt and lgkmcnt can be separated + Modified |= expandCounterSequence(MBB, InsertPos, LOAD_CNT, Wait.LoadCnt, DL); + Modified |= expandCounterSequence(MBB, InsertPos, DS_CNT, Wait.DsCnt, DL); + Modified |= expandCounterSequence(MBB, InsertPos, EXP_CNT, Wait.ExpCnt, DL); + Modified |= + expandCounterSequence(MBB, InsertPos, STORE_CNT, Wait.StoreCnt, DL); + + // If we expanded anything, remove the original waitcnt + if (Modified) { + MI.eraseFromParent(); + } + + return Modified; +} + +/// Expand a single counter wait instruction (GFX12+) +bool 
SIInsertWaitcnts::expandSingleCounterWait(MachineInstr &MI, + MachineBasicBlock &MBB, + InstCounterType CT) { + // Get the counter value from the instruction + unsigned CountValue = MI.getOperand(0).getImm(); + + // Insert expanded waitcnts BEFORE the original instruction + auto InsertPos = MI.getIterator(); + DebugLoc DL = MI.getDebugLoc(); + + bool Modified = expandCounterSequence(MBB, InsertPos, CT, CountValue, DL); + + // If we expanded, remove the original instruction + if (Modified) { + MI.eraseFromParent(); + } + + return Modified; +} + +/// Insert a sequence of wait instructions with decreasing counter values +bool SIInsertWaitcnts::expandCounterSequence( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos, + InstCounterType CT, unsigned CountValue, DebugLoc DL) { + // Skip if counter is already at zero, not active, or at max (wait not needed) + if (CountValue == 0 || CountValue == ~0u) + return false; + + unsigned MaxCount = getWaitCountMax(CT); + if (CountValue >= MaxCount) + return false; + + bool Modified = false; + + // Generate decreasing sequence: CountValue-1, CountValue-2, ..., 1, 0 + // We start from CountValue-1 because the original waitcnt already handles + // CountValue + for (int i = CountValue - 1; i >= 0; --i) { + if (ST->hasExtendedWaitCounts()) { + // GFX12+: Use separate wait instructions + unsigned Opcode = instrsForExtendedCounterTypes[CT]; + BuildMI(MBB, InsertPos, DL, TII->get(Opcode)).addImm(i); + } else { + // Pre-GFX12: Use combined S_WAITCNT with only this counter set + AMDGPU::Waitcnt Wait; + switch (CT) { + case LOAD_CNT: + Wait.LoadCnt = i; + break; + case DS_CNT: + Wait.DsCnt = i; + break; + case EXP_CNT: + Wait.ExpCnt = i; + break; + case STORE_CNT: + Wait.StoreCnt = i; + break; + case SAMPLE_CNT: + Wait.SampleCnt = i; + break; + case BVH_CNT: + Wait.BvhCnt = i; + break; + case KM_CNT: + Wait.KmCnt = i; + break; + case X_CNT: + Wait.XCnt = i; + break; + default: + break; + } + + AMDGPU::IsaVersion IV = 
AMDGPU::getIsaVersion(ST->getCPU()); + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + BuildMI(MBB, InsertPos, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + } + Modified = true; + } + + return Modified; +} + bool SIInsertWaitcnts::run(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); @@ -2963,5 +3123,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { PreheadersToFlush.clear(); SLoadAddresses.clear(); + // Expand waitcnts for profiling if requested + if (ST->isExpandWaitcntProfilingEnabled()) { + Modified |= expandWaitcntsForProfiling(MF); + } + return Modified; } diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll new file mode 100644 index 0000000000000..cc99c457677ad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll @@ -0,0 +1,239 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s + +; NOTE: These simple test cases are optimized to generate waitcnt(0) by the +; time values are needed. The expansion feature correctly does NOT expand waitcnt(0). 
+ +; Pattern: Multiple scalar loads that increment lgkmcnt, followed by use +; Expected on real kernels with non-zero lgkmcnt: +; WITHOUT expansion: s_waitcnt lgkmcnt(0) +; WITH expansion: s_waitcnt lgkmcnt(2) +; s_waitcnt lgkmcnt(1) +; s_waitcnt lgkmcnt(0) + +define amdgpu_kernel void @case1_single_counter_lgkmcnt( +; EXPAND-LABEL: case1_single_counter_lgkmcnt: +; EXPAND: ; %bb.0: +; EXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; EXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; EXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: s_add_i32 s0, s0, s1 +; EXPAND-NEXT: s_add_i32 s0, s0, s2 +; EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; EXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; EXPAND-NEXT: s_endpgm +; +; NOEXPAND-LABEL: case1_single_counter_lgkmcnt: +; NOEXPAND: ; %bb.0: +; NOEXPAND-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: s_load_dword s0, s[8:9], 0x0 +; NOEXPAND-NEXT: s_load_dword s1, s[10:11], 0x0 +; NOEXPAND-NEXT: s_load_dword s2, s[12:13], 0x0 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: s_add_i32 s0, s0, s1 +; NOEXPAND-NEXT: s_add_i32 s0, s0, s2 +; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; NOEXPAND-NEXT: global_store_dword v0, v1, s[14:15] +; NOEXPAND-NEXT: s_endpgm + ptr addrspace(4) %ptr_a, + ptr addrspace(4) %ptr_b, + ptr addrspace(4) %ptr_c, + ptr addrspace(1) %out) { + ; Three scalar loads - increment lgkmcnt + %val_a = load i32, ptr addrspace(4) %ptr_a, align 4 + %val_b = load i32, ptr addrspace(4) %ptr_b, align 4 + %val_c = load i32, ptr addrspace(4) %ptr_c, align 4 + + ; Use all three values + %sum1 = add i32 %val_a, %val_b + %sum2 = add i32 %sum1, %val_c + + store i32 %sum2, ptr addrspace(1) %out, align 4 + ret void +} + +; Pattern: Global load (vmcnt) and scalar load (lgkmcnt) can 
be separated +; Expected on real kernels with non-zero counters: +; WITHOUT expansion: s_waitcnt vmcnt(0) lgkmcnt(0) +; WITH expansion: s_waitcnt vmcnt(0) +; s_waitcnt lgkmcnt(0) + +define amdgpu_kernel void @case2_independent_counters( +; EXPAND-LABEL: case2_independent_counters: +; EXPAND: ; %bb.0: +; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; EXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: s_add_i32 s0, s4, s5 +; EXPAND-NEXT: v_mov_b32_e32 v1, s0 +; EXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; EXPAND-NEXT: s_endpgm +; +; NOEXPAND-LABEL: case2_independent_counters: +; NOEXPAND: ; %bb.0: +; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: s_load_dword s4, s[0:1], 0x0 +; NOEXPAND-NEXT: s_load_dword s5, s[2:3], 0x0 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: s_add_i32 s0, s4, s5 +; NOEXPAND-NEXT: v_mov_b32_e32 v1, s0 +; NOEXPAND-NEXT: global_store_dword v0, v1, s[6:7] +; NOEXPAND-NEXT: s_endpgm + ptr addrspace(1) %global_ptr, + ptr addrspace(4) %scalar_ptr, + ptr addrspace(1) %out) { + ; Global memory load - increments vmcnt + %global_val = load i32, ptr addrspace(1) %global_ptr, align 4 + + ; Scalar memory load - increments lgkmcnt + %scalar_val = load i32, ptr addrspace(4) %scalar_ptr, align 4 + + ; Use both values - compiler must wait for both counters + %result = add i32 %global_val, %scalar_val + + store i32 %result, ptr addrspace(1) %out, align 4 + ret void +} + +; Pattern: Multiple buffer stores followed by a load (all affect vmcnt) +; Expected on real kernels with many stores (e.g., 12 stores): +; WITHOUT expansion: s_waitcnt vmcnt(0) +; WITH expansion: s_waitcnt 
vmcnt(11) +; s_waitcnt vmcnt(10) +; ... +; s_waitcnt vmcnt(1) +; s_waitcnt vmcnt(0) + +define amdgpu_kernel void @case3_overlapping_counters( +; EXPAND-LABEL: case3_overlapping_counters: +; EXPAND: ; %bb.0: +; EXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; EXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; EXPAND-NEXT: v_mov_b32_e32 v0, 0 +; EXPAND-NEXT: v_mov_b32_e32 v1, 1 +; EXPAND-NEXT: v_mov_b32_e32 v2, 2 +; EXPAND-NEXT: s_waitcnt lgkmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: s_add_u32 s2, s2, s6 +; EXPAND-NEXT: s_addc_u32 s3, s3, s7 +; EXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48 +; EXPAND-NEXT: s_waitcnt vmcnt(0) +; EXPAND-NEXT: s_endpgm +; +; NOEXPAND-LABEL: case3_overlapping_counters: +; NOEXPAND: ; %bb.0: +; NOEXPAND-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; 
NOEXPAND-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; NOEXPAND-NEXT: v_mov_b32_e32 v0, 0 +; NOEXPAND-NEXT: v_mov_b32_e32 v1, 1 +; NOEXPAND-NEXT: v_mov_b32_e32 v2, 2 +; NOEXPAND-NEXT: s_waitcnt lgkmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:4 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:8 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:12 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:16 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:20 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:24 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:28 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:32 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:36 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:40 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v2, s[0:1] offset:44 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: s_add_u32 s2, s2, s6 +; NOEXPAND-NEXT: s_addc_u32 s3, s3, s7 +; NOEXPAND-NEXT: global_load_dword v1, v0, s[2:3] glc +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: global_store_dword v0, v1, s[0:1] offset:48 +; NOEXPAND-NEXT: s_waitcnt vmcnt(0) +; NOEXPAND-NEXT: s_endpgm + ptr addrspace(1) %buf, + ptr addrspace(1) %data, + i64 %offset) { + ; Issue 12 stores to buffer - each increments vmcnt + %ptr0 = getelementptr i32, ptr addrspace(1) %buf, i64 0 + store volatile i32 1, ptr addrspace(1) %ptr0, align 4 + %ptr1 = getelementptr i32, ptr addrspace(1) %buf, i64 1 + store volatile i32 2, ptr 
addrspace(1) %ptr1, align 4 + %ptr2 = getelementptr i32, ptr addrspace(1) %buf, i64 2 + store volatile i32 1, ptr addrspace(1) %ptr2, align 4 + %ptr3 = getelementptr i32, ptr addrspace(1) %buf, i64 3 + store volatile i32 2, ptr addrspace(1) %ptr3, align 4 + %ptr4 = getelementptr i32, ptr addrspace(1) %buf, i64 4 + store volatile i32 1, ptr addrspace(1) %ptr4, align 4 + %ptr5 = getelementptr i32, ptr addrspace(1) %buf, i64 5 + store volatile i32 2, ptr addrspace(1) %ptr5, align 4 + %ptr6 = getelementptr i32, ptr addrspace(1) %buf, i64 6 + store volatile i32 1, ptr addrspace(1) %ptr6, align 4 + %ptr7 = getelementptr i32, ptr addrspace(1) %buf, i64 7 + store volatile i32 2, ptr addrspace(1) %ptr7, align 4 + %ptr8 = getelementptr i32, ptr addrspace(1) %buf, i64 8 + store volatile i32 1, ptr addrspace(1) %ptr8, align 4 + %ptr9 = getelementptr i32, ptr addrspace(1) %buf, i64 9 + store volatile i32 2, ptr addrspace(1) %ptr9, align 4 + %ptr10 = getelementptr i32, ptr addrspace(1) %buf, i64 10 + store volatile i32 1, ptr addrspace(1) %ptr10, align 4 + %ptr11 = getelementptr i32, ptr addrspace(1) %buf, i64 11 + store volatile i32 2, ptr addrspace(1) %ptr11, align 4 + + ; Load from potentially aliasing address - also increments vmcnt + %data_ptr = getelementptr i8, ptr addrspace(1) %data, i64 %offset + %loaded = load volatile i32, ptr addrspace(1) %data_ptr, align 4 + + ; Store the loaded value + %ptr12 = getelementptr i32, ptr addrspace(1) %buf, i64 12 + store volatile i32 %loaded, ptr addrspace(1) %ptr12, align 4 + + ret void +} >From 532c866dcc8079489eb60fff37d2c72cc6310687 Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi <[email protected]> Date: Mon, 24 Nov 2025 23:19:22 +0530 Subject: [PATCH 2/3] address review: remove subtarget integration --- clang/include/clang/Driver/Options.td | 5 +---- clang/lib/Driver/ToolChains/AMDGPU.cpp | 4 ---- clang/test/Driver/amdgpu-features.c | 6 ------ llvm/lib/Target/AMDGPU/AMDGPU.td | 4 ---- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 
----- 5 files changed, 1 insertion(+), 23 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c0ba716484b6a..11e81e032d5fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5497,10 +5497,7 @@ defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64", " mode (AMDGPU only)">; defm amdgpu_precise_memory_op : SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable", - " precise memory mode (AMDGPU only)", m_amdgpu_Features_Group>; -defm amdgpu_expand_waitcnt_profiling - : SimpleMFlag<"amdgpu-expand-waitcnt-profiling", "Enable", "Disable", - " waitcnt expansion for profiling (AMDGPU only)", m_amdgpu_Features_Group>; + " precise memory mode (AMDGPU only)">; def munsafe_fp_atomics : Flag<["-"], "munsafe-fp-atomics">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, Alias<fatomic_ignore_denormal_mode>; diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index f4ddb48c9abc6..1a243fef9532d 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -700,10 +700,6 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, options::OPT_mno_amdgpu_precise_memory_op, false)) Features.push_back("+precise-memory"); - if (Args.hasFlag(options::OPT_mamdgpu_expand_waitcnt_profiling, - options::OPT_mno_amdgpu_expand_waitcnt_profiling, false)) - Features.push_back("+expand-waitcnt-profiling"); - handleTargetFeaturesGroup(D, Triple, Args, Features, options::OPT_m_amdgpu_Features_Group); } diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c index 16b3f4121ab7a..864744db203e9 100644 --- a/clang/test/Driver/amdgpu-features.c +++ b/clang/test/Driver/amdgpu-features.c @@ -38,9 +38,3 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s // NO-PREC-MEM-NOT: {{".*precise-memory"}} - -// 
RUN: %clang -### -target amdgcn -mcpu=gfx900 -mamdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=EXPAND-WAITCNT %s -// EXPAND-WAITCNT: "-target-feature" "+expand-waitcnt-profiling" - -// RUN: %clang -### -target amdgcn -mcpu=gfx900 -mno-amdgpu-expand-waitcnt-profiling %s 2>&1 | FileCheck --check-prefix=NO-EXPAND-WAITCNT %s -// NO-EXPAND-WAITCNT-NOT: "{{[+]}}expand-waitcnt-profiling" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 3f9166f48ea22..54d94b1f8682e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -223,10 +223,6 @@ def FeaturePreciseMemory : SubtargetFeature<"precise-memory", "EnablePreciseMemory", "true", "Enable precise memory mode">; -def FeatureExpandWaitcntProfiling - : SubtargetFeature<"expand-waitcnt-profiling", "EnableExpandWaitcntProfiling", - "true", "Expand waitcnt instructions for profiling">; - def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f2b885a790f41..f377b8aaf1333 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -90,7 +90,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool EnableCuMode = false; bool TrapHandler = false; bool EnablePreciseMemory = false; - bool EnableExpandWaitcntProfiling = false; // Used as options. 
bool EnableLoadStoreOpt = false; @@ -675,10 +674,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } - bool isExpandWaitcntProfilingEnabled() const { - return EnableExpandWaitcntProfiling; - } - bool hasFlatAddressSpace() const { return FlatAddressSpace; } >From 5242cfc9b73f989f534c95db59a09b6c22448945 Mon Sep 17 00:00:00 2001 From: Pankaj kumar divedi <[email protected]> Date: Tue, 25 Nov 2025 00:07:26 +0530 Subject: [PATCH 3/3] address review --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 314 ++++++++---------- .../AMDGPU/expand-waitcnt-profiling.ll | 30 +- 2 files changed, 149 insertions(+), 195 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4a70479358bad..b81554caf9dd1 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -63,6 +63,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag( cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden); +static cl::opt<bool> ExpandWaitcntProfiling( + "amdgpu-expand-waitcnt-profiling", + cl::desc("Expand s_waitcnt instructions for profiling"), cl::init(false), + cl::Hidden); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -297,6 +302,30 @@ class WaitcntGenerator { // optimization. 
bool isOptNone() const { return OptNone; } + // Get the maximum wait count value for a given counter type + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return AMDGPU::getLoadcntBitMask(IV); + case DS_CNT: + return AMDGPU::getDscntBitMask(IV); + case EXP_CNT: + return AMDGPU::getExpcntBitMask(IV); + case STORE_CNT: + return AMDGPU::getStorecntBitMask(IV); + case SAMPLE_CNT: + return AMDGPU::getSamplecntBitMask(IV); + case BVH_CNT: + return AMDGPU::getBvhcntBitMask(IV); + case KM_CNT: + return AMDGPU::getKmcntBitMask(IV); + case X_CNT: + return 0; // No hardware limit for XCNT + default: + return 0; + } + } + // Edits an existing sequence of wait count instructions according // to an incoming Waitcnt value, which is itself updated to reflect // any new wait count instructions which may need to be generated by @@ -318,9 +347,11 @@ class WaitcntGenerator { // Generates new wait count instructions according to the value of // Wait, returning true if any new instructions were created. + // If ScoreBrackets is provided, it can be used for profiling expansion. virtual bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) = 0; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) = 0; // Returns an array of bit masks which can be used to map values in // WaitEventType to corresponding counter values in InstCounterType. 
@@ -356,7 +387,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -393,7 +425,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { bool createNewWaitcnt(MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) override; + AMDGPU::Waitcnt Wait, + WaitcntBrackets *ScoreBrackets = nullptr) override; const unsigned *getWaitEventMask() const override { assert(ST); @@ -494,16 +527,6 @@ class SIInsertWaitcnts { bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool run(MachineFunction &MF); - // Methods for expanding waitcnt instructions for profiling - bool expandWaitcntsForProfiling(MachineFunction &MF); - bool expandSingleWaitcnt(MachineInstr &MI, MachineBasicBlock &MBB); - bool expandSingleCounterWait(MachineInstr &MI, MachineBasicBlock &MBB, - InstCounterType CT); - bool expandCounterSequence(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertPos, - InstCounterType CT, unsigned CountValue, - DebugLoc DL); - void setForceEmitWaitcnt() { // For non-debug builds, ForceEmitWaitcnt has been initialized to false; // For debug builds, get the debug counter info and adjust if need be @@ -1533,7 +1556,7 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( /// required counters in \p Wait bool WaitcntGeneratorPreGFX12::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { + AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { assert(ST); assert(isNormalMode(MaxCounter)); @@ -1543,28 +1566,83 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a // single instruction while VScnt has its own instruction. 
if (Wait.hasWaitExceptStoreCnt()) { - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - [[maybe_unused]] auto SWaitInst = - BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - Modified = true; + // If profiling expansion is enabled and we have score brackets, + // emit an expanded sequence + if (ExpandWaitcntProfiling && ScoreBrackets) { + // Emit expansion for each active counter + if (Wait.LoadCnt != ~0u) { + unsigned UB = ScoreBrackets->getScoreUB(LOAD_CNT); + unsigned LB = ScoreBrackets->getScoreLB(LOAD_CNT); + unsigned Outstanding = std::min(UB - LB, getWaitCountMax(LOAD_CNT) - 1); + for (unsigned i = Outstanding; i >= Wait.LoadCnt && i != ~0u; --i) { + AMDGPU::Waitcnt ExpandWait; + ExpandWait.LoadCnt = i; + unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait); + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + } + } + if (Wait.DsCnt != ~0u) { + unsigned UB = ScoreBrackets->getScoreUB(DS_CNT); + unsigned LB = ScoreBrackets->getScoreLB(DS_CNT); + unsigned Outstanding = std::min(UB - LB, getWaitCountMax(DS_CNT) - 1); + for (unsigned i = Outstanding; i >= Wait.DsCnt && i != ~0u; --i) { + AMDGPU::Waitcnt ExpandWait; + ExpandWait.DsCnt = i; + unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait); + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + } + } + if (Wait.ExpCnt != ~0u) { + unsigned UB = ScoreBrackets->getScoreUB(EXP_CNT); + unsigned LB = ScoreBrackets->getScoreLB(EXP_CNT); + unsigned Outstanding = std::min(UB - LB, getWaitCountMax(EXP_CNT) - 1); + for (unsigned i = Outstanding; i >= Wait.ExpCnt && i != ~0u; --i) { + AMDGPU::Waitcnt ExpandWait; + ExpandWait.ExpCnt = i; + unsigned Enc = AMDGPU::encodeWaitcnt(IV, ExpandWait); + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; + } + } + } else { + // Normal behavior: emit single combined waitcnt + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + [[maybe_unused]] auto SWaitInst = + 
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } if (Wait.hasWaitStoreCnt()) { assert(ST->hasVscnt()); - [[maybe_unused]] auto SWaitInst = + if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u) { + unsigned UB = ScoreBrackets->getScoreUB(STORE_CNT); + unsigned LB = ScoreBrackets->getScoreLB(STORE_CNT); + unsigned Outstanding = std::min(UB - LB, getWaitCountMax(STORE_CNT) - 1); + for (unsigned i = Outstanding; i >= Wait.StoreCnt && i != ~0u; --i) { BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(Wait.StoreCnt); - Modified = true; + .addImm(i); + Modified = true; + } + } else { + [[maybe_unused]] auto SWaitInst = + BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(Wait.StoreCnt); + Modified = true; - LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; - if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; - dbgs() << "New Instr: " << *SWaitInst << '\n'); + LLVM_DEBUG(dbgs() << "generateWaitcnt\n"; + if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It; + dbgs() << "New Instr: " << *SWaitInst << '\n'); + } } return Modified; @@ -1787,13 +1865,36 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( /// Generate S_WAIT_*CNT instructions for any required counters in \p Wait bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It, - AMDGPU::Waitcnt Wait) { + AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) { assert(ST); assert(!isNormalMode(MaxCounter)); bool Modified = false; const DebugLoc &DL = 
Block.findDebugLoc(It); + // For GFX12+, we use separate wait instructions, which makes expansion + // simpler + if (ExpandWaitcntProfiling && ScoreBrackets) { + // Emit expanded sequence for each active counter + for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + unsigned Count = getWait(Wait, CT); + if (Count == ~0u) + continue; + + unsigned UB = ScoreBrackets->getScoreUB(CT); + unsigned LB = ScoreBrackets->getScoreLB(CT); + unsigned Outstanding = std::min(UB - LB, getWaitCountMax(CT) - 1); + + for (unsigned i = Outstanding; i >= Count && i != ~0u; --i) { + BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT])) + .addImm(i); + Modified = true; + } + } + return Modified; + } + + // Normal behavior (no expansion) // Check for opportunities to use combined wait instructions. if (Wait.DsCnt != ~0u) { MachineInstr *SWaitInst = nullptr; @@ -2185,7 +2286,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, Wait.XCnt = ~0u; } - if (WCG->createNewWaitcnt(Block, It, Wait)) + if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets)) Modified = true; return Modified; @@ -2735,156 +2836,6 @@ SIInsertWaitcntsPass::run(MachineFunction &MF, .preserve<AAManager>(); } -/// Expand waitcnt instructions for profiling by inserting a sequence of -/// decreasing counter values. This helps identify which specific memory -/// operation is a bottleneck during PC sampling. 
-bool SIInsertWaitcnts::expandWaitcntsForProfiling(MachineFunction &MF) { - if (!ST->isExpandWaitcntProfilingEnabled()) - return false; - - bool Modified = false; - - // Iterate through all basic blocks - for (MachineBasicBlock &MBB : MF) { - for (auto I = MBB.begin(), E = MBB.end(); I != E;) { - MachineInstr &MI = *I; - ++I; // Advance iterator before potential expansion - - if (ST->hasExtendedWaitCounts()) { - // GFX12+: Handle separate wait instructions - if (auto CT = counterTypeForInstr(MI.getOpcode())) { - Modified |= expandSingleCounterWait(MI, MBB, *CT); - } - } else { - // Pre-GFX12: Handle combined S_WAITCNT - if (MI.getOpcode() == AMDGPU::S_WAITCNT) { - Modified |= expandSingleWaitcnt(MI, MBB); - } - } - } - } - - return Modified; -} - -/// Expand a single S_WAITCNT instruction (pre-GFX12) -bool SIInsertWaitcnts::expandSingleWaitcnt(MachineInstr &MI, - MachineBasicBlock &MBB) { - assert(MI.getOpcode() == AMDGPU::S_WAITCNT); - - // Decode the waitcnt immediate - unsigned Imm = MI.getOperand(0).getImm(); - AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); - AMDGPU::Waitcnt Wait = AMDGPU::decodeWaitcnt(IV, Imm); - - // Insert expanded waitcnts BEFORE the original instruction - auto InsertPos = MI.getIterator(); - DebugLoc DL = MI.getDebugLoc(); - - bool Modified = false; - - // Expand each counter independently - // For independent counters (Case 2 from requirements): - // vmcnt and lgkmcnt can be separated - Modified |= expandCounterSequence(MBB, InsertPos, LOAD_CNT, Wait.LoadCnt, DL); - Modified |= expandCounterSequence(MBB, InsertPos, DS_CNT, Wait.DsCnt, DL); - Modified |= expandCounterSequence(MBB, InsertPos, EXP_CNT, Wait.ExpCnt, DL); - Modified |= - expandCounterSequence(MBB, InsertPos, STORE_CNT, Wait.StoreCnt, DL); - - // If we expanded anything, remove the original waitcnt - if (Modified) { - MI.eraseFromParent(); - } - - return Modified; -} - -/// Expand a single counter wait instruction (GFX12+) -bool 
SIInsertWaitcnts::expandSingleCounterWait(MachineInstr &MI, - MachineBasicBlock &MBB, - InstCounterType CT) { - // Get the counter value from the instruction - unsigned CountValue = MI.getOperand(0).getImm(); - - // Insert expanded waitcnts BEFORE the original instruction - auto InsertPos = MI.getIterator(); - DebugLoc DL = MI.getDebugLoc(); - - bool Modified = expandCounterSequence(MBB, InsertPos, CT, CountValue, DL); - - // If we expanded, remove the original instruction - if (Modified) { - MI.eraseFromParent(); - } - - return Modified; -} - -/// Insert a sequence of wait instructions with decreasing counter values -bool SIInsertWaitcnts::expandCounterSequence( - MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPos, - InstCounterType CT, unsigned CountValue, DebugLoc DL) { - // Skip if counter is already at zero, not active, or at max (wait not needed) - if (CountValue == 0 || CountValue == ~0u) - return false; - - unsigned MaxCount = getWaitCountMax(CT); - if (CountValue >= MaxCount) - return false; - - bool Modified = false; - - // Generate decreasing sequence: CountValue-1, CountValue-2, ..., 1, 0 - // We start from CountValue-1 because the original waitcnt already handles - // CountValue - for (int i = CountValue - 1; i >= 0; --i) { - if (ST->hasExtendedWaitCounts()) { - // GFX12+: Use separate wait instructions - unsigned Opcode = instrsForExtendedCounterTypes[CT]; - BuildMI(MBB, InsertPos, DL, TII->get(Opcode)).addImm(i); - } else { - // Pre-GFX12: Use combined S_WAITCNT with only this counter set - AMDGPU::Waitcnt Wait; - switch (CT) { - case LOAD_CNT: - Wait.LoadCnt = i; - break; - case DS_CNT: - Wait.DsCnt = i; - break; - case EXP_CNT: - Wait.ExpCnt = i; - break; - case STORE_CNT: - Wait.StoreCnt = i; - break; - case SAMPLE_CNT: - Wait.SampleCnt = i; - break; - case BVH_CNT: - Wait.BvhCnt = i; - break; - case KM_CNT: - Wait.KmCnt = i; - break; - case X_CNT: - Wait.XCnt = i; - break; - default: - break; - } - - AMDGPU::IsaVersion IV = 
AMDGPU::getIsaVersion(ST->getCPU()); - unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); - BuildMI(MBB, InsertPos, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); - } - Modified = true; - } - - return Modified; -} - bool SIInsertWaitcnts::run(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); @@ -3123,10 +3074,5 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { PreheadersToFlush.clear(); SLoadAddresses.clear(); - // Expand waitcnts for profiling if requested - if (ST->isExpandWaitcntProfilingEnabled()) { - Modified |= expandWaitcntsForProfiling(MF); - } - return Modified; } diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll index cc99c457677ad..b5583cfe2dc3b 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll @@ -1,16 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-expand-waitcnt-profiling -verify-machineinstrs < %s | FileCheck --check-prefix=EXPAND %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=NOEXPAND %s -; NOTE: These simple test cases are optimized to generate waitcnt(0) by the -; time values are needed. The expansion feature correctly does NOT expand waitcnt(0). 
- -; Pattern: Multiple scalar loads that increment lgkmcnt, followed by use -; Expected on real kernels with non-zero lgkmcnt: -; WITHOUT expansion: s_waitcnt lgkmcnt(0) -; WITH expansion: s_waitcnt lgkmcnt(2) -; s_waitcnt lgkmcnt(1) -; s_waitcnt lgkmcnt(0) +; This test demonstrates the waitcnt expansion feature for PC-sampling profiling. +; The expansion transforms a single waitcnt instruction into a sequence of waitcnts +; with decreasing counter values to help identify which specific memory operation +; is causing a bottleneck. +; +; NOTE: These simple test cases are optimized such that by the time a wait is needed, +; all outstanding operations have already been issued and can be waited on with a +; single waitcnt. In this case, there are no outstanding operations at the wait point +; (upper bound = target value), so no expansion occurs. This is correct behavior. +; +; In real-world kernels with complex control flow, there will be outstanding operations +; when waits are inserted. For example, if 5 memory operations are outstanding and we +; need to wait until only 2 remain outstanding, the expansion will generate: +; s_waitcnt lgkmcnt(4) ; wait for 1st op +; s_waitcnt lgkmcnt(3) ; wait for 2nd op +; s_waitcnt lgkmcnt(2) ; target reached +; This allows PC-sampling to identify which specific operation is slow. define amdgpu_kernel void @case1_single_counter_lgkmcnt( ; EXPAND-LABEL: case1_single_counter_lgkmcnt: _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
