@@ -2326,6 +2326,20 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
+    if (ST->isPreciseMemoryEnabled()) {
+      AMDGPU::Waitcnt Wait;
+      if (WCG == &WCGPreGFX12)
+        Wait = AMDGPU::Waitcnt(0, 0, 0, 0);
jwanggit86 wrote:

The option `amdgpu-waitcnt-forcezero` appears to force an `s_waitcnt(0)` before EVERY instruction.
With this example:
define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
  ret void
}
the generated assembly (.s) is as follows:
; %bb.0:
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        flat_load_dword v4, v[0:1]
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_mov_b64 s[4:5], 0
.LBB0_1:                                ; %atomicrmw.start
                                        ; =>This Inner Loop Header: Depth=1
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        v_add_f32_e32 v3, v4, v2
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        v_cmp_eq_u32_e32 vcc, v3, v4
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_or_b64 s[4:5], vcc, s[4:5]
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        v_mov_b32_e32 v4, v3
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_andn2_b64 exec, exec, s[4:5]
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_cbranch_execnz .LBB0_1
; %bb.2:                                ; %atomicrmw.end
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_or_b64 exec, exec, s[4:5]
        s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
        s_setpc_b64 s[30:31]
So, it doesn't seem to be helpful for either this issue or 

cfe-commits mailing list

Reply via email to