https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/154322
>From 883e110c8f86719a810c4d5a1930434af532194c Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Tue, 19 Aug 2025 21:29:05 +0900 Subject: [PATCH] AMDGPU: Add baseline test for unspilling VGPRs after MFMA rewrite Test for #154260 --- .../unspill-vgpr-after-rewrite-vgpr-mfma.ll | 454 ++++++++++++++++++ 1 file changed, 454 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll new file mode 100644 index 0000000000000..122d46b39ff32 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll @@ -0,0 +1,454 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck %s + +; After reassigning the MFMA to use AGPRs, we've alleviated enough +; register pressure to try eliminating the spill of %spill with the freed +; up VGPR. Baseline: the checks below still spill and reload %spill.
+define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg, ptr addrspace(1) inreg %ptr) #0 { +; CHECK-LABEL: eliminate_spill_after_mfma_rewrite: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_accvgpr_write_b32 a3, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:120 ; 4-byte 
Folded Spill +; CHECK-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a56, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a63, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[32:63], v[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_write_b32 a63, v31 +; CHECK-NEXT: v_accvgpr_write_b32 a62, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a61, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a60, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a59, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a58, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a57, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a56, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a55, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a54, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a53, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a52, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a51, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a50, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a49, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a48, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a47, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a46, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a45, v13 +; CHECK-NEXT: v_accvgpr_write_b32 a44, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a43, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a42, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a41, v9 +; CHECK-NEXT: v_accvgpr_write_b32 a40, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a39, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a38, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a37, v5 +; CHECK-NEXT: 
v_accvgpr_write_b32 a36, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def a[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def a[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] 
offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload 
+; CHECK-NEXT: buffer_load_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 
offset:128 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %x, i32 %y, <4 x i32> %arg, i32 0, i32 0, i32 0) + %v = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0, $1", "=v,=v"() + %v0 = extractvalue { <32 x i32>, <32 x i32> } %v, 0 + %v1 = extractvalue { <32 x i32>, <32 x i32> } %v, 1 + %spill = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai) + %a0 = call <32 x i32> asm sideeffect "; def $0", "=a"() + %a1 = call <32 x i32> asm sideeffect "; def $0", "=a"() + store volatile <32 x i32> %v0, ptr addrspace(1) %ptr + store 
volatile <32 x i32> %v1, ptr addrspace(1) %ptr + store volatile <4 x i32> %spill, ptr addrspace(1) %ptr + ret void +} + +; Same, except we fold out 2 spills from %spill0 and %spill1 +define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %arg, ptr addrspace(1) inreg %ptr) #0 { +; CHECK-LABEL: eliminate_spill_after_mfma_rewrite_x2: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_accvgpr_write_b32 a3, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a2, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:128 
; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword 
a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword a63, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[32:63], v[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_accvgpr_write_b32 a63, v31 +; CHECK-NEXT: v_accvgpr_write_b32 a62, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a61, v29 +; CHECK-NEXT: v_accvgpr_write_b32 a60, v28 +; CHECK-NEXT: v_accvgpr_write_b32 a59, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a58, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a57, v25 +; CHECK-NEXT: v_accvgpr_write_b32 a56, v24 +; CHECK-NEXT: v_accvgpr_write_b32 a55, v23 +; CHECK-NEXT: v_accvgpr_write_b32 a54, v22 +; CHECK-NEXT: v_accvgpr_write_b32 a53, v21 +; CHECK-NEXT: v_accvgpr_write_b32 a52, v20 +; CHECK-NEXT: v_accvgpr_write_b32 a51, v19 +; CHECK-NEXT: v_accvgpr_write_b32 a50, v18 +; CHECK-NEXT: v_accvgpr_write_b32 a49, v17 +; CHECK-NEXT: v_accvgpr_write_b32 a48, v16 +; CHECK-NEXT: v_accvgpr_write_b32 a47, v15 +; CHECK-NEXT: v_accvgpr_write_b32 a46, v14 +; CHECK-NEXT: v_accvgpr_write_b32 a45, v13 +; CHECK-NEXT: v_accvgpr_write_b32 a44, v12 +; CHECK-NEXT: v_accvgpr_write_b32 a43, v11 +; CHECK-NEXT: v_accvgpr_write_b32 a42, v10 +; CHECK-NEXT: v_accvgpr_write_b32 a41, v9 +; 
CHECK-NEXT: v_accvgpr_write_b32 a40, v8 +; CHECK-NEXT: v_accvgpr_write_b32 a39, v7 +; CHECK-NEXT: v_accvgpr_write_b32 a38, v6 +; CHECK-NEXT: v_accvgpr_write_b32 a37, v5 +; CHECK-NEXT: v_accvgpr_write_b32 a36, v4 +; CHECK-NEXT: v_accvgpr_write_b32 a35, v3 +; CHECK-NEXT: v_accvgpr_write_b32 a34, v2 +; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 +; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 +; CHECK-NEXT: v_accvgpr_read_b32 v7, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v6, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v5, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def a[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def a[0:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dwordx4 v0, v[60:63], s[16:17] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[56:59], s[16:17] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[52:55], s[16:17] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: 
global_store_dwordx4 v0, v[48:51], s[16:17] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[44:47], s[16:17] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[40:43], s[16:17] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[36:39], s[16:17] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[32:35], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[56:59], s[16:17] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[60:63], s[16:17] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[48:51], s[16:17] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[52:55], s[16:17] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[40:43], s[16:17] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[44:47], s[16:17] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[32:35], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, 
s[0:3], s32 offset:216 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a55, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a54, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a53, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a52, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a51, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a50, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a49, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a48, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a47, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a45, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a44, off, s[0:3], s32 offset:76 ; 4-byte Folded 
Reload +; CHECK-NEXT: buffer_load_dword a43, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a42, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a40, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a39, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a38, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a36, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a35, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a34, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %x, i32 %y, <4 x i32> %arg, i32 0, i32 0, i32 0) + %v = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0, $1", "=v,=v"() + %v0 = extractvalue { <32 x i32>, <32 x i32> } %v, 0 + %v1 = extractvalue { <32 x i32>, <32 x i32> } %v, 1 + %spill0 = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai) + %spill1 = call <4 x i32> asm sideeffect "; def $0", "=v,v"(<4 x i32> %mai) + %a0 = call <32 x i32> asm sideeffect "; def $0", "=a"() + %a1 = call <32 x i32> asm sideeffect "; def $0", "=a"() + store volatile <32 x i32> %v0, ptr addrspace(1) %ptr + store volatile <32 x i32> %v1, ptr addrspace(1) %ptr + store volatile <4 x i32> %spill0, ptr addrspace(1) %ptr + store volatile <4 x i32> %spill1, ptr addrspace(1) %ptr + ret void +} + +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32 immarg, i32 immarg, i32 immarg) #1 + +attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" } +attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits