================
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Full codegen on gfx950. Two MFMAs fed by loaded floats; three independent
+; i32 muls stored to a second buffer. sched.barrier(0) isolates the MUL+MFMA
+; region so that address-computation VALUs don't inflate the VALU gap in
+; MFMAValuSpacingOpt.
+;
+; With iglp_opt(4) the expected MFMA/VALU interleaving (ValuGap=1) is:
+; MFMA, MUL, MFMA, MUL, MUL
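+;
+; sched.barrier with a mask of 0 blocks all code motion across the barrier;
+; the iglp_opt mask selects which interleaving strategy is applied.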
+;
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - %s | FileCheck %s
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @llvm.amdgcn.iglp.opt(i32 immarg)
+declare void @llvm.amdgcn.sched.barrier(i32 immarg)
+
+define amdgpu_kernel void @mfma_valu_iglp4(ptr addrspace(1) %p, ptr addrspace(1) %q) #0 {
+; CHECK-LABEL: mfma_valu_iglp4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v8, 2, v0
+; CHECK-NEXT: v_mov_b32_e32 v9, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[6:7], v8, s[0:1]
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
+; CHECK-NEXT: ; sched_barrier mask(0x00000000)
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3]
+; CHECK-NEXT: v_mul_lo_u32 v4, v6, v6
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3]
+; CHECK-NEXT: v_mul_lo_u32 v5, v6, v7
+; CHECK-NEXT: v_mul_lo_u32 v6, v7, v7
+; CHECK-NEXT: ; iglp_opt mask(0x00000004)
+; CHECK-NEXT: ; sched_barrier mask(0x00000000)
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1]
+; CHECK-NEXT: global_store_dwordx3 v8, v[4:6], s[2:3]
+; CHECK-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %t = load <4 x float>, ptr addrspace(1) %p
+ %gep0 = getelementptr inbounds float, ptr addrspace(1) %p, i32 %tid
+ %gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i32 1
+ %f0 = load float, ptr addrspace(1) %gep0
+ %f1 = load float, ptr addrspace(1) %gep1
+ %i0 = bitcast float %f0 to i32
+ %i1 = bitcast float %f1 to i32
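+ ; Keep the address-computation VALUs above out of the MUL+MFMA region.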
+ call void @llvm.amdgcn.sched.barrier(i32 0)
+ %m0 = mul nsw i32 %i0, %i0
+ %m1 = mul nsw i32 %i0, %i1
+ %m2 = mul nsw i32 %i1, %i1
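+ ; iglp_opt(4) interleaves the two MFMAs below with the muls above (ValuGap=1).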
+ call void @llvm.amdgcn.iglp.opt(i32 4)
+ %mai = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %t, i32 0, i32 0, i32 0)
+ %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %f0, float %f1, <4 x float> %mai, i32 0, i32 0, i32 0)
+ call void @llvm.amdgcn.sched.barrier(i32 0)
+ store <4 x float> %mai2, ptr addrspace(1) %p
+ %qgep0 = getelementptr inbounds i32, ptr addrspace(1) %q, i32 %tid
+ %qgep1 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 1
+ %qgep2 = getelementptr inbounds i32, ptr addrspace(1) %qgep0, i32 2
+ store i32 %m0, ptr addrspace(1) %qgep0
+ store i32 %m1, ptr addrspace(1) %qgep1
+ store i32 %m2, ptr addrspace(1) %qgep2
+ ret void
+}
+
+attributes #0 = { "uniform-work-group-size"="true" }
----------------
hidekisaito wrote:
probably not. will remove.
https://github.com/llvm/llvm-project/pull/190916