llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

Start considering `!amdgpu.no.remote.memory` and `!amdgpu.no.fine.grained.memory` metadata when deciding whether to expand integer atomic operations. This does not yet attempt to accurately handle fadd/fmin/fmax, which are trickier and require migrating the old "amdgpu-unsafe-fp-atomics" attribute. (An illustrative IR snippet follows the file list below.)

---

Patch is 1.43 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122138.diff

28 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+53-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+31-30)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+2521-614)
- (modified) llvm/test/CodeGen/AMDGPU/acc-ldst.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+14-12)
- (modified) llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+87-85)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+100-926)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+82-80)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+3758-1362)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+258-1374)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll (+100-1212)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll (+82-80)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+319-79)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+98-984)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+82-80)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+109-1221)
- (modified) llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll (+42-28)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll (+1-1)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll (+534-159)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll (+990-49)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll (+30-330)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll (+990-49)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll (+30-330)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll (+209-6)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll (+130-8)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll (+43-21)
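For illustration only (this snippet is not part of the patch; the function name and constants are invented), an integer `atomicrmw` carrying the two annotations this change keys on looks like this. With the metadata attached, the backend can keep the native instruction instead of expanding it:

```llvm
; Hypothetical example: both annotations assert the address is neither
; remote nor fine-grained, so the native atomic should behave correctly.
define i32 @annotated_atomic_sub(ptr addrspace(1) %ptr) {
  %result = atomicrmw sub ptr addrspace(1) %ptr, i32 1 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0
  ret i32 %result
}

!0 = !{}
```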
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 513251e398ad4d..5fa8e1532096f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16621,19 +16621,60 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   case AtomicRMWInst::UDecWrap: {
     if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
         AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      // Always expand system scope atomics.
-      if (HasSystemScope) {
-        if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-            Op == AtomicRMWInst::Xor) {
-          // Atomic sub/or/xor do not work over PCI express, but atomic add
-          // does. InstCombine transforms these with 0 to or, so undo that.
-          if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
-              ConstVal && ConstVal->isNullValue())
-            return AtomicExpansionKind::Expand;
-        }
-
-        return AtomicExpansionKind::CmpXChg;
+      // On most subtargets, for atomicrmw operations other than add/xchg,
+      // whether or not the instructions will behave correctly depends on where
+      // the address physically resides and what interconnect is used in the
+      // system configuration. On some targets the instruction will nop, and
+      // in others synchronization will only occur at degraded device scope.
+      //
+      // If the allocation is known local to the device, the instructions should
+      // work correctly.
+      if (RMW->hasMetadata("amdgpu.no.remote.memory"))
+        return atomicSupportedIfLegalIntType(RMW);
+
+      // If fine-grained remote memory works at device scope, we don't need to
+      // do anything.
+      if (!HasSystemScope &&
+          Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+        return atomicSupportedIfLegalIntType(RMW);
+
+      // If we are targeting a remote allocated address, it depends on what
+      // kind of allocation the address belongs to.
+      //
+      // If the allocation is fine-grained (in host memory, or in PCIe peer
+      // device memory), the operation will fail depending on the target.
+      //
+      // Note fine-grained host memory access does work on APUs or if XGMI is
+      // used, but we do not know if we are targeting an APU or the system
+      // configuration from the ISA version/target-cpu.
+      if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+        return atomicSupportedIfLegalIntType(RMW);
+
+      if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+          Op == AtomicRMWInst::Xor) {
+        // Atomic sub/or/xor do not work over PCI express, but atomic add
+        // does. InstCombine transforms these with 0 to or, so undo that.
+        if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
+            ConstVal && ConstVal->isNullValue())
+          return AtomicExpansionKind::Expand;
       }
+
+      // If the allocation could be in remote, fine-grained memory, the rmw
+      // instructions may fail. cmpxchg should work, so emit that. On some
+      // system configurations, PCIe atomics aren't supported so cmpxchg won't
+      // even work, so you're out of luck anyway.
+
+      // In summary:
+      //
+      // Cases that may fail:
+      // - fine-grained pinned host memory
+      // - fine-grained migratable host memory
+      // - fine-grained PCIe peer device
+      //
+      // Cases that should work, but may be treated overly conservatively:
+      // - fine-grained host memory on an APU
+      // - fine-grained XGMI peer device
+      return AtomicExpansionKind::CmpXChg;
     }
 
     return atomicSupportedIfLegalIntType(RMW);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index 35aa3cfbc841c8..11d01189375ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -85,7 +85,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -350,7 +350,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -427,7 +427,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -656,7 +656,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -723,7 +723,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -962,7 +962,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
   %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
   %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   store i32 %result, ptr addrspace(1) %out.gep, align 4
   ret void
 }
@@ -1040,7 +1040,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
   %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -1119,7 +1119,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   store i32 %result, ptr %out, align 4
   ret void
 }
@@ -1206,7 +1206,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   store i32 %result, ptr %out, align 4
   ret void
 }
@@ -1442,7 +1442,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -1516,7 +1516,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -1780,7 +1780,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i32, ptr %ptr, i32 %id
   %out.gep = getelementptr i32, ptr %out, i32 %id
   %gep = getelementptr i32, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   store i32 %result, ptr %out.gep, align 4
   ret void
 }
@@ -1875,7 +1875,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, ptr %ptr, i32 %id
   %gep = getelementptr i32, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -1969,7 +1969,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -2071,7 +2071,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -2144,7 +2144,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -2223,7 +2223,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -2536,7 +2536,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %out.gep = getelementptr i64, ptr %out, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
   store i64 %result, ptr %out.gep, align 4
   ret void
 }
@@ -2635,7 +2635,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -2724,7 +2724,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -2807,7 +2807,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -2953,7 +2953,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -3016,7 +3016,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -3092,7 +3092,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -3174,7 +3174,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -3440,7 +3440,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -3512,7 +3512,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -3788,7 +3788,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
   %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
   %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
   %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   store i64 %result, ptr addrspace(1) %out.gep, align 4
   ret void
 }
@@ -3871,7 +3871,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
   %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
   ret void
 }
 
@@ -3966,7 +3966,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -3977,6 +3977,7 @@ attributes #1 = { nounwind }
 attributes #2 = { nounwind memory(none) }
 
 !0 = !{i32 5, i32 6}
+!1 = !{}
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 00d932b584aaf9..1eefad8b6cc6e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -332,13 +332,28 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_mov_b64 s[4:5], 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_load_dword s6, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v2, s6
+; CI-NEXT:  .LBB4_1: ; %atomicrmw.start
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; C...
[truncated]
``````````

</details>


https://github.com/llvm/llvm-project/pull/122138
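As context for the `AtomicExpansionKind::CmpXChg` path in the SIISelLowering.cpp hunk above: when neither annotation is present (and the subtarget does not support agent-scope fine-grained remote memory atomics), AtomicExpand rewrites the rmw into a compare-exchange loop. A rough, hand-written sketch of that expansion's shape, not output generated by this patch:

```llvm
; Approximate shape of the expansion for an unannotated 'atomicrmw sub':
; load the old value once, then retry cmpxchg until it succeeds.
define i32 @sub_expanded(ptr addrspace(1) %ptr, i32 %val) {
entry:
  %0 = load i32, ptr addrspace(1) %ptr, align 4
  br label %atomicrmw.start

atomicrmw.start:
  %loaded = phi i32 [ %0, %entry ], [ %newloaded, %atomicrmw.start ]
  %new = sub i32 %loaded, %val
  %1 = cmpxchg ptr addrspace(1) %ptr, i32 %loaded, i32 %new syncscope("agent") seq_cst seq_cst, align 4
  %newloaded = extractvalue { i32, i1 } %1, 0
  %success = extractvalue { i32, i1 } %1, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret i32 %newloaded
}
```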