llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

AGPRs are undesirable since they are usable by only a handful of instructions (loads, stores, and MFMAs); everything else requires copies to/from VGPRs. Using the AGPR form should be a measure of last resort, reserved for when we must use more than 256 VGPRs.

---

Patch is 1.30 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159493.diff

28 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll (+128-172)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/acc-ldst.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll (+123-125)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll (+147-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll (+464-498)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll (+540-740)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll (+730-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+352-534)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll (+46-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll (+1006-1115)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll (+168-1050)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll (+2436-4283)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll (+50-70)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll (+553-552)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+2931-2)
- (modified) llvm/test/CodeGen/AMDGPU/mfma-loop.ll (+950-1156)
- (modified) llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll (+462-525)
- (modified) llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/spill-agpr.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+51-53)


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 908d856d386f5..0077c6915c520 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -37,7 +37,7 @@ static cl::opt<bool> MFMAVGPRForm(
     "amdgpu-mfma-vgpr-form", cl::Hidden,
     cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
              "unspecified, default to compiler heuristics"),
-    cl::init(false));
+    cl::init(true));
 
 const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
   const SITargetLowering *TLI = STI->getTargetLowering();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 5720b882f4e73..2493065806794 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -15,59 +15,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GCN: ; %bb.0: ; %bb
 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
 ; GCN-NEXT: s_mov_b64 s[36:37], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
-; GCN-NEXT: s_mov_b32 s38, 2
-; GCN-NEXT: s_mov_b32 s39, s37
+; GCN-NEXT: v_pk_mov_b32 v[32:33], s[36:37], s[36:37] op_sel:[0,1]
+; GCN-NEXT: s_mov_b32 s36, 2
+; GCN-NEXT: v_pk_mov_b32 v[34:35], s[36:37], s[36:37] op_sel:[0,1]
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
 ; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a16, s16
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s4
-; GCN-NEXT: v_accvgpr_write_b32 a5, s5
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_accvgpr_write_b32 a8, s8
-; GCN-NEXT: v_accvgpr_write_b32 a9, s9
-; GCN-NEXT: v_accvgpr_write_b32 a10, s10
-; GCN-NEXT: v_accvgpr_write_b32 a11, s11
-; GCN-NEXT: v_accvgpr_write_b32 a12, s12
-; GCN-NEXT: v_accvgpr_write_b32 a13, s13
-; GCN-NEXT: v_accvgpr_write_b32 a14, s14
-; GCN-NEXT: v_accvgpr_write_b32 a15, s15
-; GCN-NEXT: v_accvgpr_write_b32 a17, s17
-; GCN-NEXT: v_accvgpr_write_b32 a18, s18
-; GCN-NEXT: v_accvgpr_write_b32 a19, s19
-; GCN-NEXT: v_accvgpr_write_b32 a20, s20
-; GCN-NEXT: v_accvgpr_write_b32 a21, s21
-; GCN-NEXT: v_accvgpr_write_b32 a22, s22
-; GCN-NEXT: v_accvgpr_write_b32 a23, s23
-; GCN-NEXT: v_accvgpr_write_b32 a24, s24
-; GCN-NEXT: v_accvgpr_write_b32 a25, s25
-; GCN-NEXT: v_accvgpr_write_b32 a26, s26
-; GCN-NEXT: v_accvgpr_write_b32 a27, s27
-; GCN-NEXT: v_accvgpr_write_b32 a28, s28
-; GCN-NEXT: v_accvgpr_write_b32 a29, s29
-; GCN-NEXT: v_accvgpr_write_b32 a30, s30
-; GCN-NEXT: v_accvgpr_write_b32 a31, s31
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[22:23], s[22:23], s[22:23] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[32:33], v[34:35], v[0:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v32, 0
 ; GCN-NEXT: s_nop 15
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GCN-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GCN-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GCN-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
 ; GCN-NEXT: s_endpgm
 bb:
   %in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -83,36 +66,28 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GCN: ; %bb.0: ; %bb
 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
 ; GCN-NEXT: s_mov_b64 s[18:19], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[16:17], s[18:19], s[18:19] op_sel:[0,1]
 ; GCN-NEXT: s_mov_b32 s18, 2
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s4
-; GCN-NEXT: v_accvgpr_write_b32 a5, s5
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_accvgpr_write_b32 a8, s8
-; GCN-NEXT: v_accvgpr_write_b32 a9, s9
-; GCN-NEXT: v_accvgpr_write_b32 a10, s10
-; GCN-NEXT: v_accvgpr_write_b32 a11, s11
-; GCN-NEXT: v_accvgpr_write_b32 a12, s12
-; GCN-NEXT: v_accvgpr_write_b32 a13, s13
-; GCN-NEXT: v_accvgpr_write_b32 a14, s14
-; GCN-NEXT: v_accvgpr_write_b32 a15, s15
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v16, 0
 ; GCN-NEXT: s_nop 9
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GCN-NEXT: s_endpgm
 bb:
   %in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -128,21 +103,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GCN: ; %bb.0: ; %bb
 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GCN-NEXT: s_mov_b64 s[4:5], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
 ; GCN-NEXT: s_mov_b32 s4, 2
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-NEXT: s_nop 3
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GCN-NEXT: s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -158,37 +131,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GCN: ; %bb.0: ; %bb
 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
 ; GCN-NEXT: s_mov_b64 s[18:19], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[16:17], s[18:19], s[18:19] op_sel:[0,1]
 ; GCN-NEXT: s_mov_b32 s18, 2
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s4
-; GCN-NEXT: v_accvgpr_write_b32 a5, s5
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_accvgpr_write_b32 a8, s8
-; GCN-NEXT: v_accvgpr_write_b32 a9, s9
-; GCN-NEXT: v_accvgpr_write_b32 a10, s10
-; GCN-NEXT: v_accvgpr_write_b32 a11, s11
-; GCN-NEXT: v_accvgpr_write_b32 a12, s12
-; GCN-NEXT: v_accvgpr_write_b32 a13, s13
-; GCN-NEXT: v_accvgpr_write_b32 a14, s14
-; GCN-NEXT: v_accvgpr_write_b32 a15, s15
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v16, 0
 ; GCN-NEXT: s_nop 15
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
 ; GCN-NEXT: s_endpgm
 bb:
   %in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -204,21 +169,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GCN: ; %bb.0: ; %bb
 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GCN-NEXT: s_mov_b64 s[4:5], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
 ; GCN-NEXT: s_mov_b32 s4, 2
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-NEXT: s_nop 9
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GCN-NEXT: s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -238,12 +201,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
 ; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
+; GCN-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
 ; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v2, 0
 ; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GCN-NEXT: s_endpgm
 bb:
   %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
@@ -258,25 +221,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1]
 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s4
-; GCN-NEXT: v_accvgpr_write_b32 a5, s5
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v8, 0
 ; GCN-NEXT: s_nop 15
 ; GCN-NEXT: s_nop 0
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
 ; GCN-NEXT: s_endpgm
 bb:
   %in.1 = load <4 x double>, ptr addrspace(1) %arg
@@ -291,16 +250,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v8, 0
 ; GCN-NEXT: s_nop 15
 ; GCN-NEXT: s_nop 0
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GCN-NEXT: s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
@@ -312,28 +271,26 @@ bb:
 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
 ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
 ; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x34
+; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GCN-NEXT: s_mov_b64 s[0:1], 0
 ; GCN-NEXT: s_mov_b64 s[6:7], 1.0
-; GCN-NEXT: s_mov_b64 s[8:9], 0
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
+; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GCN-NEXT: v_accvgpr_write_b32 a2, s8
-; GCN-NEXT: v_accvgpr_write_b32 a4, s8
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a3, s9
-; GCN-NEXT: v_accvgpr_write_b32 a5, s9
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
 ; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GCN-NEXT: v_mov_b32_e32 v8, 0
 ; GCN-NEXT: s_nop 15
 ; GCN-NEXT: s_nop 0
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GCN-N... [truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/159493
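For anyone who wants to compare the two forms locally, here is a minimal sketch (not part of the patch): the file name, kernel name, and llc invocations below are made up for illustration, and it assumes the hidden `cl::opt` can be passed directly to `llc`, as is usual for such flags. It reuses the `llvm.amdgcn.mfma.f64.4x4x4f64` intrinsic exercised by the tests above; with the new default the accumulator should be selected into VGPRs, while `-amdgpu-mfma-vgpr-form=0` should give back the pre-patch, AGPR-oriented selection (modulo the compiler heuristics mentioned in the flag description).

```llvm
; mfma-vgpr-form.ll -- illustrative only; names and RUN lines are hypothetical.
; Suggested comparison (assuming an llc built from this branch):
;   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a mfma-vgpr-form.ll -o vgpr.s
;   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 \
;       mfma-vgpr-form.ll -o other.s
; vgpr.s is expected to keep the MFMA accumulator in v registers (the new
; default); other.s should reflect the previous AGPR-based selection, or
; whatever the heuristic picks when the forced VGPR form is disabled.

declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32 immarg, i32 immarg, i32 immarg)

define amdgpu_kernel void @mfma_form_demo(ptr addrspace(1) %out, double %a, double %b) {
bb:
  ; A single f64 MFMA, matching the pattern used throughout the updated tests.
  %mai = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
  store double %mai, ptr addrspace(1) %out
  ret void
}
```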