llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-globalisel Author: Pierre van Houtryve (Pierre-vh) <details> <summary>Changes</summary> wb/wbinv use storecnt, inv uses loadcnt. Track them as VMEM_WRITE_ACCESS and VMEM_READ_ACCESS to avoid InsertWaitCnt incorrectly eliminating the waitcnts after these instructions. Solves SWDEV-526604 --- Patch is 395.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135340.diff 32 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+13-7) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (+16) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (+16) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+9) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll (+10) - (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+4) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+35) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+23) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+23) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+80) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+62) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+62) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+58) - (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+8) - (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+8) - (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll (+5) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+92) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+62) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+62) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+58) - (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll (+48) - (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll (+55) - (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+4) - (modified) llvm/test/CodeGen/AMDGPU/global-saddr-store.ll (+4) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+5) - (modified) llvm/test/CodeGen/AMDGPU/insert-waitcnts-gfx12-wbinv.mir (+3) - (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+4) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+32) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+30) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+30) - (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+30) - (modified) llvm/test/CodeGen/AMDGPU/waitcnt-global-inv-wb.mir (+1-1) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 8848eebdeb6b3..5b35740ca86d4 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -698,6 +698,16 @@ class SIInsertWaitcnts { // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or // FLAT instruction. WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case AMDGPU::GLOBAL_INV: + return VMEM_READ_ACCESS; // tracked using loadcnt + case AMDGPU::GLOBAL_WB: + case AMDGPU::GLOBAL_WBINV: + return VMEM_WRITE_ACCESS; // tracked using storecnt + default: + break; + } + // Maps VMEM access types to their corresponding WaitEventType. static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; @@ -2130,15 +2140,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } } else if (TII->isFLAT(Inst)) { - // TODO: Track this properly. - if (isCacheInvOrWBInst(Inst)) - return; - - assert(Inst.mayLoadOrStore()); - int FlatASCount = 0; - if (mayAccessVMEMThroughFlat(Inst)) { + assert(isCacheInvOrWBInst(Inst) || Inst.mayLoadOrStore()); + + if (isCacheInvOrWBInst(Inst) || mayAccessVMEMThroughFlat(Inst)) { ++FlatASCount; ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index d1a303b41deef..666523c88860c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -22,6 +22,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f32: @@ -94,6 +95,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: ds_max_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f32: @@ -166,6 +168,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_ret_f64: @@ -242,6 +245,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmax_noret_f64: @@ -318,6 +322,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -464,6 +469,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -624,6 +630,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -779,6 +786,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -909,6 +917,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1051,6 +1060,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1210,6 +1220,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1363,6 +1374,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: @@ -1495,6 +1507,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -1651,6 +1664,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: @@ -1824,6 +1838,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: @@ -1994,6 +2009,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index b8538cbf254fc..351502816ae6e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -22,6 +22,7 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f32: @@ -94,6 +95,7 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) { ; GFX12-NEXT: ds_min_num_f32 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f32: @@ -166,6 +168,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_ret_f64: @@ -242,6 +245,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) { ; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: local_atomic_fmin_noret_f64: @@ -318,6 +322,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -464,6 +469,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -624,6 +630,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -779,6 +786,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -909,6 +917,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1051,6 +1060,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1210,6 +1220,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1363,6 +1374,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_cbranch_execnz .LBB11_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: @@ -1495,6 +1507,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -1651,6 +1664,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_ ; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory: @@ -1824,6 +1838,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_cbranch_execnz .LBB14_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: @@ -1994,6 +2009,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index 92a7de9aaefd2..2193b93742d44 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -576,6 +576,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -805,6 +806,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -1721,6 +1723,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; GFX12-NEXT: v_mov_b32_e32 v0, 42 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1968,6 +1971,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; GFX12-NEXT: v_mov_b32_e32 v0, 42 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 @@ -2466,6 +2470,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS @@ -2718,6 +2723,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS @@ -3423,6 +3429,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: v_mov_b32_e32 v3, s3 ; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS @@ -3693,6 +3700,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS @@ -4072,6 +4080,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; GFX12-NEXT: ds_inc_rtn_u32 v2, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ds_inc_rtn_u32 v0, v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll index e88c5e78779b3..07d5ff2036d93 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -1232,6 +1232,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr ; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst @@ -1279,6 +1280,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace( ; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/135340 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits