Author: anjenner
Date: 2025-12-09T23:13:33Z
New Revision: 27651133e213a6a1eb4d0e47837625cee3613111
URL: https://github.com/llvm/llvm-project/commit/27651133e213a6a1eb4d0e47837625cee3613111
DIFF: https://github.com/llvm/llvm-project/commit/27651133e213a6a1eb4d0e47837625cee3613111.diff

LOG: AMDGPU: Drop and upgrade llvm.amdgcn.atomic.csub/cond.sub to atomicrmw (#105553)

Both intrinsics perform a conditional subtraction; when the difference would be negative, the stored result is the minuend (the memory value is left unchanged) for cond.sub and zero for csub.

Added: 


Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/docs/ReleaseNotes.md
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/IR/AutoUpgrade.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
    llvm/lib/Target/AMDGPU/BUFInstructions.td
    llvm/lib/Target/AMDGPU/DSInstructions.td
    llvm/lib/Target/AMDGPU/FLATInstructions.td
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
    llvm/test/Bitcode/amdgcn-atomic.ll
    llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll

Removed: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll


################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 3e7a5dfc504ae..7ecf1c1124894 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1527,11 +1527,6 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
  The iglp_opt strategy implementations are subject to change.
 
-  llvm.amdgcn.atomic.cond.sub.u32  Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32
-  and ds_cond_sub_u32 based on address space on gfx12 targets. This
-  performs a subtraction only if the memory value is greater than or
-  equal to the data value.
-
  llvm.amdgcn.s.barrier.signal.isfirst  Provides access to the s_barrier_signal_first instruction; additionally ensures that the result value is valid even when the intrinsic is used from a wave that is not running in a workgroup.
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 8ec46c661974b..1b85145efbf4a 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -114,6 +114,10 @@ Changes to the AArch64 Backend
 
 Changes to the AMDGPU Backend
 -----------------------------
 
+* Removed `llvm.amdgcn.atomic.cond.sub.u32` and
+  `llvm.amdgcn.atomic.csub.u32` intrinsics. Users should use the
+  `atomicrmw` instruction with `usub_cond` and `usub_sat` instead.
+
 Changes to the ARM Backend
 --------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 03488f8389aa2..64d3dd6c3b701 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2888,8 +2888,6 @@ class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
   [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>;
 
-def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>;
-
 // uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
 // <ray_dir>, <ray_inv_dir>, <texture_descr>
 // <node_ptr> is i32 or i64.
@@ -3137,8 +3135,6 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>; - class AMDGPULoadIntrinsic<LLVMType ptr_ty>: Intrinsic< [llvm_any_ty], diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index e67f1ecd96bb1..2202b08e3cf0d 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1274,9 +1274,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } if (Name.consume_front("atomic.")) { - if (Name.starts_with("inc") || Name.starts_with("dec")) { - // These were replaced with atomicrmw uinc_wrap and udec_wrap, so - // there's no new declaration. + if (Name.starts_with("inc") || Name.starts_with("dec") || + Name.starts_with("cond.sub") || Name.starts_with("csub")) { + // These were replaced with atomicrmw uinc_wrap, udec_wrap, usub_cond + // and usub_sat so there's no new declaration. NewFn = nullptr; return true; } @@ -4606,7 +4607,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, .StartsWith("global.atomic.fmin", AtomicRMWInst::FMin) .StartsWith("flat.atomic.fmin", AtomicRMWInst::FMin) .StartsWith("global.atomic.fmax", AtomicRMWInst::FMax) - .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax); + .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax) + .StartsWith("atomic.cond.sub", AtomicRMWInst::USubCond) + .StartsWith("atomic.csub", AtomicRMWInst::USubSat); unsigned NumOperands = CI->getNumOperands(); if (NumOperands < 3) // Malformed bitcode. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index dd86cb5d3d5a6..2a99dacba52a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op { } } -defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; defm int_amdgcn_global_atomic_fmax_num : noret_op; -defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op; multiclass noret_binary_atomic_op<SDNode atomic_op> { let HasNoUse = true in diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index a7955ee2dac40..ce4cc799543f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5309,12 +5309,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td 
b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index fe452f008c95c..58a9b5511f2d0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; def : SourceOfDivergence<int_r600_read_tidig_x>; def : SourceOfDivergence<int_r600_read_tidig_y>; def : SourceOfDivergence<int_r600_read_tidig_z>; -def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>; -def : SourceOfDivergence<int_amdgcn_global_atomic_csub>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index b97b7385dc1ff..bb0e9380e956d 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -783,37 +783,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, multiclass MUBUF_Pseudo_Atomics_RTN <string opName, RegisterOperand vdataClass, - ValueType vdataType, - SDPatternOperator atomic> { + ValueType vdataType> { let FPAtomic = vdataType.isFP in { - def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_RTN">; - - def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_RTN">; - + def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>, + MUBUFAddr64Table <0, NAME # "_RTN">; + def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>, + MUBUFAddr64Table <1, NAME # "_RTN">; def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>; def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>; def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>; - def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; - - def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; - + def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>, + MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; + def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>, + MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>; def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>; def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>; @@ -822,10 +805,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName, multiclass MUBUF_Pseudo_Atomics <string opName, RegisterOperand 
vdataClass, - ValueType vdataType, - SDPatternOperator atomic = null_frag> : + ValueType vdataType> : MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>, - MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>; + MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType>; //===----------------------------------------------------------------------===// @@ -1096,7 +1078,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < let OtherPredicates = [HasGFX10_BEncoding] in { defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub + "buffer_atomic_csub", VGPROp_32, i32 >; } @@ -1117,22 +1099,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag + "buffer_atomic_fcmpswap", AVLdSt_64, v2f32 >; } let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmin", AVLdSt_32, f32, null_frag + "buffer_atomic_fmin", AVLdSt_32, f32 >; defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmax", AVLdSt_32, f32, null_frag + "buffer_atomic_fmax", AVLdSt_32, f32 >; } let SubtargetPredicate = isGFX6GFX7GFX10 in { defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag + "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64 >; } @@ -1201,12 +1183,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < let SubtargetPredicate = HasAtomicFaddRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< - "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag + "buffer_atomic_add_f32", AVLdSt_32, f32 >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag + "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16 >; let SubtargetPredicate = isGFX12Plus in { diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 4b3bd0c09e076..3a53cef96473c 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -886,15 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3 def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32", int_amdgcn_ds_bpermute_fi_b32>; -multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst, - ValueType vt, string frag> { - def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_addrspace")>; - def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>; -} - -defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">; } // let SubtargetPredicate = isGFX12Plus let SubtargetPredicate = isGFX1250Plus in { diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 0e60c73aa5db7..9e38af91c7ccf 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1562,10 +1562,6 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType } } -multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix, - ValueType vt> : - FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>; - 
multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>; @@ -1590,10 +1586,6 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt, } } -multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix, - ValueType vt> : - FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>; - multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> : FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>; @@ -2189,9 +2181,6 @@ let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in { defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; -defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; -defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; - let OtherPredicates = [HasD16LoadStore] in { defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a871d978dfbc8..0f91b319b16d4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1522,15 +1522,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_global_atomic_csub: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | - MachineMemOperand::MOVolatile; - return true; - } case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh_intersect_ray: case Intrinsic::amdgcn_image_bvh8_intersect_ray: { @@ -1551,8 +1542,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_flat_atomic_fmin_num: - case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: { + case Intrinsic::amdgcn_flat_atomic_fmax_num: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1727,7 +1717,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Type *&AccessTy) const { Value *Ptr = nullptr; switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_cluster_load_b128: case Intrinsic::amdgcn_cluster_load_b64: case Intrinsic::amdgcn_cluster_load_b32: @@ -1750,7 +1739,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_flat_load_monitor_b128: case Intrinsic::amdgcn_flat_load_monitor_b32: case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: @@ -18390,7 +18378,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case 
AMDGPUISD::BUFFER_ATOMIC_INC: case AMDGPUISD::BUFFER_ATOMIC_DEC: case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: - case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll index 15355ea139205..d9e51c39c2042 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll @@ -15,62 +15,5 @@ define amdgpu_kernel void @test2(ptr %ptr, i32 %cmp, i32 %new) { ret void } -; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val) -define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 { - %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val) - store i32 %ret, ptr addrspace(1) %ptr, align 4 - ret void -} - -; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) -define amdgpu_kernel void @test_ds_atomic_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) #0 { -entry: - %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) - store i32 %val, ptr addrspace(3) %use - ret void -} - -; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) -define amdgpu_kernel void @test_flat_atomic_cond_sub_u32(ptr %addr, i32 %in, ptr %use) #0 { -entry: - %gep = getelementptr i32, ptr %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) - store i32 %val, ptr %use - ret void -} - -; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) -define amdgpu_kernel void @test_global_atomic_cond_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) #0 { -entry: - %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) - store i32 %val, ptr addrspace(1) %use - ret void -} - -; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) -define float @test_raw_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -entry: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) -define float @test_struct_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -entry: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1 -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) #1 -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) #1 -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) #1 -declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #1 -declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #1 - attributes #0 = { 
nounwind } attributes #1 = { argmemonly nounwind willreturn } diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll index 3e28cd050fc88..e9194ea1e14fe 100644 --- a/llvm/test/Bitcode/amdgcn-atomic.ll +++ b/llvm/test/Bitcode/amdgcn-atomic.ll @@ -420,5 +420,152 @@ define double @upgrade_amdgcn_global_atomic_fmax_f64_p1_f64(ptr addrspace(1) %pt attributes #0 = { argmemonly nounwind willreturn } +define void @atomic_usub_cond(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { + ; CHECK: atomicrmw usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result3 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result4 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false) + + ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8 + %result5 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false) + ret void +} + +define void @atomic_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8 + %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false) + ret void +} + +; Test some invalid ordering handling +define void @ordering_usub_cond_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { + ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true) + + ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result1 = call i32 
@llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false) + + ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") monotonic, align 4 + %result3 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result4 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result5 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result6 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result7 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result8 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false) + + ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result9 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true) + + ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result10 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true) + + ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result11 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 -1, i64 0, i1 true) + + ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result12 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 true) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result13 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 1, i64 0, i1 false) + + ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") monotonic, align 8 + %result14 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 2, i64 0, i1 true) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result15 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result16 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 4, i1 true) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result17 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 5, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result18 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 6, i1 true) + + ; 
CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result19 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 7, i1 false) + + ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result20 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 8, i1 true) + + ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result21 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 true) + ret void +} + +define void @immarg_violations_usub_sat(ptr %ptr0, i32 %val32, i1 %val1, i64 %val64) { + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 %val32, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 %val64, i64 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8 + %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 %val64, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8 + %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 0, i1 %val1) + ret void +} + +declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 + +declare i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 + ; CHECK: !0 = !{i32 5, i32 6} ; CHECK: !1 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll deleted file mode 100644 index bff4771319454..0000000000000 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ /dev/null @@ -1,270 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12 - -define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret i32 %ret -} - -define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: 
global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret i32 %ret -} - -define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_offset_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_offset_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define amdgpu_kernel 
void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: global_store_dword v[0:1], v0, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: global_store_b32 v[0:1], v0, off -; GFX12-NEXT: s_endpgm - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - store i32 %ret, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_endpgm - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw 
usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -attributes #0 = { nounwind willreturn } -attributes #1 = { argmemonly nounwind } - -!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 4af2d58b01518..d281492c647f1 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -2,190 +2,222 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) - -define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: +define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in) { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; 
GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: +define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 4 + %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr %use ret void } -define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: +define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) 
"target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: +define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_endpgm ; -; 
GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { -; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: +define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { +; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -193,9 +225,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -203,15 +237,17 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -219,9 
+255,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -229,38 +267,44 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { -; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: +define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { +; GFX12-SDAG-LABEL: ds_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: ds_store_b32 v1, v0 ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: ds_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: ds_store_b32 v1, v0 ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(3) %use ret void } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll deleted file mode 100644 index 243cd59c6a821..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ /dev/null @@ -1,219 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12 - -define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 
0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - ret void -} - -define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - ret void -} - -define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - ret void -} - -define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced: -; 
GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - ret void -} - -define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - ret void -} - -define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - ret void -} - -define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void 
@struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - ret void -} - -define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - ret void -} - -declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 - -attributes #0 = { nounwind } -attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" } -
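
For out-of-tree IR that still calls the removed intrinsics, the pattern applied throughout the updated tests above can also be written by hand: each call becomes an atomicrmw instruction, usub_cond for the cond.sub form and usub_sat for the csub form. The following is a minimal hand-written sketch, not part of this commit; the function and value names are illustrative, and the seq_cst ordering plus the !amdgpu.no.remote.memory annotation simply mirror what the tests use rather than being required by the upgrade.

; Sketch only; names here are illustrative and not taken from the commit.
; Replaces what was previously a call to llvm.amdgcn.atomic.cond.sub.u32.
define i32 @usub_cond_example(ptr addrspace(1) %ptr, i32 %val) {
entry:
  %old = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %val seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %old
}

; Replaces what was previously a call to llvm.amdgcn.global.atomic.csub.
define i32 @usub_sat_example(ptr addrspace(1) %ptr, i32 %val) {
entry:
  %old = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %val seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %old
}

!0 = !{}

As with any atomicrmw, the result is the value the memory location held before the subtraction.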
