================
@@ -2326,6 +2326,20 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
   }
 #endif
 
+  if (ST->isPreciseMemoryEnabled()) {
+    AMDGPU::Waitcnt Wait;
+    if (WCG == &WCGPreGFX12)
+      Wait = AMDGPU::Waitcnt(0, 0, 0, 0);
----------------
jwanggit86 wrote:
The option `amdgpu-waitcnt-forcezero` appears to force an `s_waitcnt(0)` before EVERY instruction. Take this example:

```
define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
  ret void
}
```

The generated assembly (.s) is as follows:

```
; %bb.0:
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        flat_load_dword v4, v[0:1]
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_mov_b64 s[4:5], 0
.LBB0_1:                                ; %atomicrmw.start
                                        ; =>This Inner Loop Header: Depth=1
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        v_add_f32_e32 v3, v4, v2
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        v_cmp_eq_u32_e32 vcc, v3, v4
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_or_b64 s[4:5], vcc, s[4:5]
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        v_mov_b32_e32 v4, v3
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_andn2_b64 exec, exec, s[4:5]
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_cbranch_execnz .LBB0_1
; %bb.2:                                ; %atomicrmw.end
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_or_b64 exec, exec, s[4:5]
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_setpc_b64 s[30:31]
```

So the option does not seem to be helpful for either this issue or [issue#66](https://github.com/ROCm/ROCm-CompilerSupport/issues/66).

https://github.com/llvm/llvm-project/pull/79236
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits