llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-transforms Author: Pierre van Houtryve (Pierre-vh) <details> <summary>Changes</summary> --- Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154710.diff 9 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-4) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+2) - (modified) llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll (-12) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll (+3235-504) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll (+2892-540) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll (+3131-475) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll (+2892-540) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll (+2938-540) - (added) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll (+172) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 561019bb65549..60faf211df0d9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) { !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS); } +static TargetLowering::AtomicExpansionKind +getPrivateAtomicExpansionKind(const GCNSubtarget &STI) { + // For GAS, lower to flat atomic. + return STI.hasGloballyAddressableScratch() + ? TargetLowering::AtomicExpansionKind::Expand + : TargetLowering::AtomicExpansionKind::NotAtomic; +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); if (AS == AMDGPUAS::PRIVATE_ADDRESS) - return AtomicExpansionKind::NotAtomic; + return getPrivateAtomicExpansionKind(*getSubtarget()); // 64-bit flat atomics that dynamically reside in private memory will silently // be dropped. @@ -18038,14 +18046,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS - ? AtomicExpansionKind::NotAtomic + ? getPrivateAtomicExpansionKind(*getSubtarget()) : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS - ? AtomicExpansionKind::NotAtomic + ? getPrivateAtomicExpansionKind(*getSubtarget()) : AtomicExpansionKind::None; } @@ -18053,7 +18061,7 @@ TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { unsigned AddrSpace = CmpX->getPointerAddressSpace(); if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) - return AtomicExpansionKind::NotAtomic; + return getPrivateAtomicExpansionKind(*getSubtarget()); if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX)) return AtomicExpansionKind::None; @@ -18423,9 +18431,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate( Builder.CreateBr(ExitBB); } +static void convertScratchAtomicToFlatAtomic(Instruction *I, + unsigned PtrOpIdx) { + Value *PtrOp = I->getOperand(PtrOpIdx); + assert(PtrOp->getType()->getPointerAddressSpace() == + AMDGPUAS::PRIVATE_ADDRESS); + + Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS); + Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast", + I->getIterator()); + I->setOperand(PtrOpIdx, ASCast); +} + void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { AtomicRMWInst::BinOp Op = AI->getOperation(); + if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex()); + if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor) { if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand()); @@ -18448,9 +18471,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { } void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const { + if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex()); + emitExpandAtomicAddrSpacePredicate(CI); } +void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const { + if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex()); + + llvm_unreachable( + "Expand Atomic Load only handles SCRATCH -> FLAT conversion"); +} + +void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const { + if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex()); + + llvm_unreachable( + "Expand Atomic Store only handles SCRATCH -> FLAT conversion"); +} + LoadInst * SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { IRBuilder<> Builder(AI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index dedd9ae170774..e96b702367299 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const; void emitExpandAtomicRMW(AtomicRMWInst *AI) const override; void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override; + void emitExpandAtomicLoad(LoadInst *LI) const override; + void emitExpandAtomicStore(StoreInst *SI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll index d13d76fcfabf4..fcdba69c30213 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll @@ -86,15 +86,3 @@ entry: store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4 ret void } - -; GCN: scratch_atomic_store: -; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel scratch_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) { -entry: - store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll index af5b529fc387e..fe345f9244066 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll @@ -179,11 +179,35 @@ define amdgpu_kernel void @private_agent_unordered_load( ; ; GFX1250-LABEL: private_agent_unordered_load: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: ; implicit-def: $sgpr1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: scratch_load_b32 v0, off, s1 -; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -358,11 +382,35 @@ define amdgpu_kernel void @private_agent_monotonic_load( ; ; GFX1250-LABEL: private_agent_monotonic_load: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: ; implicit-def: $sgpr1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: scratch_load_b32 v0, off, s1 -; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -537,11 +585,36 @@ define amdgpu_kernel void @private_agent_acquire_load( ; ; GFX1250-LABEL: private_agent_acquire_load: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: ; implicit-def: $sgpr1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: scratch_load_b32 v0, off, s1 -; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -716,11 +789,42 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; ; GFX1250-LABEL: private_agent_seq_cst_load: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: ; implicit-def: $sgpr1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: scratch_load_b32 v0, off, s1 -; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(5) %out) { @@ -875,11 +979,35 @@ define amdgpu_kernel void @private_agent_unordered_store( ; ; GFX1250-LABEL: private_agent_unordered_store: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: ; implicit-def: $sgpr1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { entry: @@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; ; GFX1250-LABEL: private_agent_monotonic_store: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: ; implicit-def: $sgpr1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { entry: @@ -1189,11 +1341,39 @@ define amdgpu_kernel void @private_agent_release_store( ; ; GFX1250-LABEL: private_agent_release_store: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: ; implicit-def: $sgpr1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; implicit-def: $sgpr1 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { entry: @@ -1346,11 +1526,39 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; ; GFX1250-LABEL: private_agent_seq_cst_store: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT:... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/154710 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits