https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/168017

From 9398eaf07c6641ade0b128130738f8bcf98447e2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Thu, 13 Nov 2025 19:54:18 -0800
Subject: [PATCH] AMDGPU: Fix verifier error when waterfall call target is in
 AV register

This isn't an ideal fix. Technically, the path fixed here is only an
optimization, and we shouldn't need to go down it for correctness. The base
path, where a copy is inserted, is still broken.

The lit test changes are mostly regressions to be fixed later.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   44 +-
 .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll |   28 +-
 .../CodeGen/AMDGPU/copy-to-reg-frameindex.ll  |   17 +-
 llvm/test/CodeGen/AMDGPU/mfma-loop.ll         | 1526 ++++++++++-------
 .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll     |    2 +-
 ...terfall-call-target-av-register-failure.ll |  141 ++
 6 files changed, 1052 insertions(+), 706 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3bf820a0024e7..f5b52425e7841 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8177,26 +8177,34 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       return;
     }
 
-    if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
-        NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
-      // Instead of creating a copy where src and dst are the same register
-      // class, we just replace all uses of dst with src.  These kinds of
-      // copies interfere with the heuristics MachineSink uses to decide
-      // whether or not to split a critical edge.  Since the pass assumes
-      // that copies will end up as machine instructions and not be
-      // eliminated.
-      addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+    if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
       Register NewDstReg = Inst.getOperand(1).getReg();
-      MRI.replaceRegWith(DstReg, NewDstReg);
-      MRI.clearKillFlags(NewDstReg);
-      Inst.getOperand(0).setReg(DstReg);
-      Inst.eraseFromParent();
-      // Legalize t16 operand since replaceReg is called after addUsersToVALU
-      for (MachineOperand &MO :
-           make_early_inc_range(MRI.use_operands(NewDstReg))) {
-        legalizeOperandsVALUt16(*MO.getParent(), MRI);
+      const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
+      if (const TargetRegisterClass *CommonRC =
+              RI.getCommonSubClass(NewDstRC, SrcRC)) {
+        // Instead of creating a copy between registers whose classes share a
+        // common subclass, replace all uses of dst with src and constrain src
+        // to that subclass.  Such copies interfere with the heuristics
+        // MachineSink uses to decide whether or not to split a critical edge,
+        // since that pass assumes copies will end up as machine instructions
+        // and not be eliminated.
+        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+        MRI.replaceRegWith(DstReg, NewDstReg);
+        MRI.clearKillFlags(NewDstReg);
+        Inst.getOperand(0).setReg(DstReg);
+
+        if (!MRI.constrainRegClass(NewDstReg, CommonRC))
+          llvm_unreachable("failed to constrain register");
+
+        Inst.eraseFromParent();
+        // Legalize t16 operand since replaceReg is called after addUsersToVALU
+        for (MachineOperand &MO :
+             make_early_inc_range(MRI.use_operands(NewDstReg))) {
+          legalizeOperandsVALUt16(*MO.getParent(), MRI);
+        }
+
+        return;
       }
-      return;
     }
 
     // If this is a v2s copy between 16bit and 32bit reg,
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 196958b74442f..ae53bdff7c251 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -10733,15 +10733,16 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX90A-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
 ; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB135_6: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -11000,15 +11001,16 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
 ; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX90A-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
 ; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB137_6: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -19023,15 +19025,16 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
 ; GFX90A-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
 ; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB243_6: ; %atomicrmw.phi
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a[0:1]
@@ -19282,15 +19285,16 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
 ; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
 ; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX90A-NEXT:    s_waitcnt vmcnt(1)
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
 ; GFX90A-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
 ; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
-; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT:  .LBB245_6: ; %atomicrmw.phi
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
index aede91b76f441..a13f3513c660e 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll
@@ -43,26 +43,25 @@ define void @phi_with_alloca_and_divergent_copy_to_reg(ptr addrspace(5) %diverge
 ; CHECK-LABEL: phi_with_alloca_and_divergent_copy_to_reg:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_lshr_b32 s6, s32, 6
 ; CHECK-NEXT:    v_mov_b32_e32 v7, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v6, v1
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-NEXT:    v_lshrrev_b32_e64 v2, 6, s32
 ; CHECK-NEXT:  .LBB1_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_add_u32_e32 v8, 1, v3
-; CHECK-NEXT:    v_lshl_add_u32 v5, v3, 2, v1
-; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v8
-; CHECK-NEXT:    v_mov_b32_e32 v2, v1
-; CHECK-NEXT:    v_mov_b32_e32 v1, v0
-; CHECK-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    v_lshl_add_u32 v2, v3, 2, v1
+; CHECK-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_u32_e32 v2, 1, v3
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v2
 ; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v4
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_1
 ; CHECK-NEXT:  ; %bb.2: ; %done
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dword v[6:7], v0, off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index fe432e9d7594d..331a29b3f4a93 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -101,39 +101,39 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX90A-LABEL: test_mfma_loop_zeroinit:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB0_1: ; %for.cond.preheader
@@ -160,39 +160,39 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX942-LABEL: test_mfma_loop_zeroinit:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB0_1: ; %for.cond.preheader
@@ -333,6 +333,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT:    s_mov_b32 s0, 16
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
@@ -365,7 +366,6 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT:    s_mov_b32 s0, 16
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader
@@ -393,6 +393,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX942-LABEL: test_mfma_loop_unfoldable_splat:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT:    s_mov_b32 s0, 16
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
@@ -425,7 +426,6 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX942-NEXT:    s_mov_b32 s0, 16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB1_1: ; %for.cond.preheader
@@ -559,39 +559,39 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX90A-LABEL: test_mfma_loop_non_splat:
 ; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 1.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX90A-NEXT:  .LBB2_1: ; %for.cond.preheader
@@ -618,39 +618,39 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX942-LABEL: test_mfma_loop_non_splat:
 ; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-NEXT:  .LBB2_1: ; %for.cond.preheader
@@ -821,71 +821,71 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ;
 ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x431a0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43190000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43180000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43170000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43160000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43150000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43140000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43130000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43120000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43110000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43100000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430f0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430e0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430d0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430c0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430b0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430a0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43090000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43080000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43070000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43060000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43050000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43040000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43030000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43020000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43010000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43000000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fe0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fc0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fa0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f80000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f80000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fa0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fc0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fe0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43000000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43010000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43020000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43030000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43040000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43050000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43060000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43070000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43080000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43090000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430a0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430b0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430c0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430d0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430e0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430f0000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43100000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43110000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43120000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43130000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43140000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43150000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43160000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43170000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43180000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43190000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x431a0000
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB3_1: ; %for.cond.preheader
@@ -912,71 +912,71 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ;
 ; GFX942-LABEL: test_mfma_loop_unfoldable_seq:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x431a0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43190000
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43180000
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43170000
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43160000
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43150000
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43140000
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43130000
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43120000
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43110000
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43100000
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430f0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430e0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430d0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430c0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430b0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430a0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43090000
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43080000
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43070000
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43060000
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43050000
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43040000
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43030000
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43020000
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43010000
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43000000
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fe0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fc0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fa0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f80000
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f80000
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fa0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fc0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fe0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43000000
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43010000
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43020000
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43030000
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43040000
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43050000
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43060000
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43070000
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43080000
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43090000
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430a0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430b0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430c0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430d0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430e0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430f0000
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43100000
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43110000
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43120000
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43130000
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43140000
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43150000
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43160000
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43170000
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43180000
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43190000
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x431a0000
 ; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB3_1: ; %for.cond.preheader
@@ -1111,39 +1111,39 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr 
addrspace(1) %arg) #0 {
 ; GFX90A-LABEL: test_mfma_loop_vgpr_init:
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB4_1: ; %for.cond.preheader
@@ -1171,39 +1171,39 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr 
addrspace(1) %arg) #0 {
 ; GFX942-LABEL: test_mfma_loop_vgpr_init:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB4_1: ; %for.cond.preheader
@@ -1376,46 +1376,47 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr 
addrspace(1) %arg, float
 ;
 ; GFX90A-LABEL: test_mfma_loop_sgpr_init:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x2c
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX90A-NEXT:    s_mov_b32 s0, 16
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:  .LBB5_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
@@ -1437,46 +1438,47 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr 
addrspace(1) %arg, float
 ;
 ; GFX942-LABEL: test_mfma_loop_sgpr_init:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x2c
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT:    s_load_dword s1, s[4:5], 0x2c
+; GFX942-NEXT:    s_mov_b32 s0, 16
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:  .LBB5_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
@@ -1641,42 +1643,43 @@ define amdgpu_kernel void 
@test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ;
 ; GFX90A-LABEL: test_mfma_loop_mixed_init:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX90A-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX90A-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, s0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB6_1: ; %for.cond.preheader
@@ -1703,42 +1706,43 @@ define amdgpu_kernel void 
@test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ;
 ; GFX942-LABEL: test_mfma_loop_mixed_init:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, s0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB6_1: ; %for.cond.preheader
@@ -2090,57 +2094,152 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
 ; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 2
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v32, a0
 ; GFX90A-NEXT:  .LBB8_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v32
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v31
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v30
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v29
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v28
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v27
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v26
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v25
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v24
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v23
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v22
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v21
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v20
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v19
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v18
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v17
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v2
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a33, a31
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a32, a30
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a29
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a28
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a27
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a26
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a25
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a24
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a23
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a22
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a21
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a20
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a19
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a18
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a17
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a16
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a15
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a14
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a15, a13
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a14, a12
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a13, a11
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a12, a10
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a11, a9
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a10, a8
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a9, a7
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a8, a6
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a7, a5
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a6, a4
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a5, a3
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a4, a2
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a1
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a2, a0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[2:33], v0, v1, a[2:33]
+; GFX90A-NEXT:    s_nop 15
+; GFX90A-NEXT:    s_nop 2
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a0, a2
+; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a3
+; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a4
+; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a5
+; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a6
+; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a7
+; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a8
+; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a9
+; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a10
+; GFX90A-NEXT:    v_accvgpr_read_b32 v10, a11
+; GFX90A-NEXT:    v_accvgpr_read_b32 v11, a12
+; GFX90A-NEXT:    v_accvgpr_read_b32 v12, a13
+; GFX90A-NEXT:    v_accvgpr_read_b32 v13, a14
+; GFX90A-NEXT:    v_accvgpr_read_b32 v14, a15
+; GFX90A-NEXT:    v_accvgpr_read_b32 v15, a16
+; GFX90A-NEXT:    v_accvgpr_read_b32 v16, a17
+; GFX90A-NEXT:    v_accvgpr_read_b32 v17, a18
+; GFX90A-NEXT:    v_accvgpr_read_b32 v18, a19
+; GFX90A-NEXT:    v_accvgpr_read_b32 v19, a20
+; GFX90A-NEXT:    v_accvgpr_read_b32 v20, a21
+; GFX90A-NEXT:    v_accvgpr_read_b32 v21, a22
+; GFX90A-NEXT:    v_accvgpr_read_b32 v22, a23
+; GFX90A-NEXT:    v_accvgpr_read_b32 v23, a24
+; GFX90A-NEXT:    v_accvgpr_read_b32 v24, a25
+; GFX90A-NEXT:    v_accvgpr_read_b32 v25, a26
+; GFX90A-NEXT:    v_accvgpr_read_b32 v26, a27
+; GFX90A-NEXT:    v_accvgpr_read_b32 v27, a28
+; GFX90A-NEXT:    v_accvgpr_read_b32 v28, a29
+; GFX90A-NEXT:    v_accvgpr_read_b32 v29, a30
+; GFX90A-NEXT:    v_accvgpr_read_b32 v30, a31
+; GFX90A-NEXT:    v_accvgpr_read_b32 v31, a32
+; GFX90A-NEXT:    v_accvgpr_read_b32 v32, a33
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_nop 12
-; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[30:33], s[0:1] offset:112
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[26:29], s[0:1] offset:96
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[22:25], s[0:1] offset:80
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[18:21], s[0:1] offset:64
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[14:17], s[0:1] offset:48
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[10:13], s[0:1] offset:32
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[6:9], s[0:1] offset:16
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[2:5], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: test_mfma_loop_agpr_init:
@@ -2152,57 +2251,152 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
 ; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a0
+; GFX942-NEXT:    v_accvgpr_read_b32 v32, a0
 ; GFX942-NEXT:  .LBB8_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v32
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v31
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v30
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v29
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v28
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v27
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v26
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v25
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v24
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v23
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, v22
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, v21
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, v20
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, v19
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, v18
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, v17
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, v16
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, v15
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, v14
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, v13
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, v12
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, v11
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, v10
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, v9
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, v8
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, v7
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, v6
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, v5
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v4
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v3
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v2
+; GFX942-NEXT:    v_accvgpr_mov_b32 a33, a31
+; GFX942-NEXT:    v_accvgpr_mov_b32 a32, a30
+; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a29
+; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a28
+; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a27
+; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a26
+; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a25
+; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a24
+; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a23
+; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a22
+; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a21
+; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a20
+; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a19
+; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a18
+; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a17
+; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a16
+; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a15
+; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a14
+; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a13
+; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a12
+; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a11
+; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a10
+; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a9
+; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a8
+; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a7
+; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a6
+; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a5
+; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a4
+; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a3
+; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a2
+; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[2:33], v0, v1, a[2:33]
+; GFX942-NEXT:    s_nop 15
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a0, a2
+; GFX942-NEXT:    v_accvgpr_read_b32 v2, a3
+; GFX942-NEXT:    v_accvgpr_read_b32 v3, a4
+; GFX942-NEXT:    v_accvgpr_read_b32 v4, a5
+; GFX942-NEXT:    v_accvgpr_read_b32 v5, a6
+; GFX942-NEXT:    v_accvgpr_read_b32 v6, a7
+; GFX942-NEXT:    v_accvgpr_read_b32 v7, a8
+; GFX942-NEXT:    v_accvgpr_read_b32 v8, a9
+; GFX942-NEXT:    v_accvgpr_read_b32 v9, a10
+; GFX942-NEXT:    v_accvgpr_read_b32 v10, a11
+; GFX942-NEXT:    v_accvgpr_read_b32 v11, a12
+; GFX942-NEXT:    v_accvgpr_read_b32 v12, a13
+; GFX942-NEXT:    v_accvgpr_read_b32 v13, a14
+; GFX942-NEXT:    v_accvgpr_read_b32 v14, a15
+; GFX942-NEXT:    v_accvgpr_read_b32 v15, a16
+; GFX942-NEXT:    v_accvgpr_read_b32 v16, a17
+; GFX942-NEXT:    v_accvgpr_read_b32 v17, a18
+; GFX942-NEXT:    v_accvgpr_read_b32 v18, a19
+; GFX942-NEXT:    v_accvgpr_read_b32 v19, a20
+; GFX942-NEXT:    v_accvgpr_read_b32 v20, a21
+; GFX942-NEXT:    v_accvgpr_read_b32 v21, a22
+; GFX942-NEXT:    v_accvgpr_read_b32 v22, a23
+; GFX942-NEXT:    v_accvgpr_read_b32 v23, a24
+; GFX942-NEXT:    v_accvgpr_read_b32 v24, a25
+; GFX942-NEXT:    v_accvgpr_read_b32 v25, a26
+; GFX942-NEXT:    v_accvgpr_read_b32 v26, a27
+; GFX942-NEXT:    v_accvgpr_read_b32 v27, a28
+; GFX942-NEXT:    v_accvgpr_read_b32 v28, a29
+; GFX942-NEXT:    v_accvgpr_read_b32 v29, a30
+; GFX942-NEXT:    v_accvgpr_read_b32 v30, a31
+; GFX942-NEXT:    v_accvgpr_read_b32 v31, a32
+; GFX942-NEXT:    v_accvgpr_read_b32 v32, a33
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB8_1
-; GFX942-NEXT:  ; %bb.2: ; %exit
-; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    s_nop 11
-; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT:  ; %bb.2: ; %exit
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx4 v0, a[30:33], s[0:1] offset:112
+; GFX942-NEXT:    global_store_dwordx4 v0, a[26:29], s[0:1] offset:96
+; GFX942-NEXT:    global_store_dwordx4 v0, a[22:25], s[0:1] offset:80
+; GFX942-NEXT:    global_store_dwordx4 v0, a[18:21], s[0:1] offset:64
+; GFX942-NEXT:    global_store_dwordx4 v0, a[14:17], s[0:1] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v0, a[10:13], s[0:1] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v0, a[6:9], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v0, a[2:5], s[0:1]
 ; GFX942-NEXT:    s_endpgm
 entry:
   %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
@@ -2609,39 +2803,39 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 {
 ; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use:
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX90A-NEXT:    s_mov_b32 s4, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB10_1: ; %for.cond.preheader
@@ -2690,39 +2884,39 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 {
 ; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB10_1: ; %for.cond.preheader
@@ -2867,39 +3061,39 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
 ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use:
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s4, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 1.0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT:    s_mov_b32 s4, 16
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX90A-NEXT:  .LBB11_1: ; %for.cond.preheader
@@ -2948,39 +3142,39 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
 ; GFX942-LABEL: test_mfma_loop_non_splat_ret_use:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, 1.0
-; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-NEXT:  .LBB11_1: ; %for.cond.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index 2462414992e36..12efca7dcadb5 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -6,8 +6,8 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX942-LABEL: matmul_kernel:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
 ; GFX942-NEXT:    s_mov_b32 s3, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
new file mode 100644
index 0000000000000..93d864246d68d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waterfall-call-target-av-register-failure.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s
+
+; Make sure SIFixSGPRCopies handles situations where it needs to fix
+; up copies to physical registers from an AV virtual register.
+
+define i32 @fix_sgpr_copies_indirect_call(ptr addrspace(5) %ptr) {
+; CHECK-LABEL: fix_sgpr_copies_indirect_call:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s16, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
+; CHECK-NEXT:    v_writelane_b32 v40, s16, 4
+; CHECK-NEXT:    v_writelane_b32 v40, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v40, s35, 3
+; CHECK-NEXT:    s_add_i32 s32, s32, 0x800
+; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_mov_b32_e32 v1, v0
+; CHECK-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; CHECK-NEXT:    v_writelane_b32 v41, s15, 0
+; CHECK-NEXT:    v_writelane_b32 v41, s14, 1
+; CHECK-NEXT:    v_writelane_b32 v41, s13, 2
+; CHECK-NEXT:    v_writelane_b32 v41, s12, 3
+; CHECK-NEXT:    v_writelane_b32 v41, s10, 4
+; CHECK-NEXT:    v_writelane_b32 v41, s11, 5
+; CHECK-NEXT:    v_writelane_b32 v41, s8, 6
+; CHECK-NEXT:    v_writelane_b32 v41, s9, 7
+; CHECK-NEXT:    v_writelane_b32 v41, s6, 8
+; CHECK-NEXT:    v_writelane_b32 v41, s7, 9
+; CHECK-NEXT:    v_writelane_b32 v41, s4, 10
+; CHECK-NEXT:    v_writelane_b32 v41, s5, 11
+; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT:  ; %bb.1: ; %bb1
+; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[4:5], exec
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v41, s4, 12
+; CHECK-NEXT:    v_writelane_b32 v41, s5, 13
+; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:  .LBB0_2: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    v_readfirstlane_b32 s6, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readfirstlane_b32 s8, v1
+; CHECK-NEXT:    s_mov_b32 s4, s6
+; CHECK-NEXT:    s_mov_b32 s5, s8
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; CHECK-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; CHECK-NEXT:    s_mov_b32 s7, s8
+; CHECK-NEXT:    v_writelane_b32 v41, s6, 14
+; CHECK-NEXT:    v_writelane_b32 v41, s7, 15
+; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT:    v_writelane_b32 v41, s4, 16
+; CHECK-NEXT:    v_writelane_b32 v41, s5, 17
+; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:  ; %bb.3: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s16, v41, 14
+; CHECK-NEXT:    v_readlane_b32 s17, v41, 15
+; CHECK-NEXT:    v_readlane_b32 s15, v41, 0
+; CHECK-NEXT:    v_readlane_b32 s14, v41, 1
+; CHECK-NEXT:    v_readlane_b32 s13, v41, 2
+; CHECK-NEXT:    v_readlane_b32 s12, v41, 3
+; CHECK-NEXT:    v_readlane_b32 s10, v41, 4
+; CHECK-NEXT:    v_readlane_b32 s11, v41, 5
+; CHECK-NEXT:    v_readlane_b32 s8, v41, 6
+; CHECK-NEXT:    v_readlane_b32 s9, v41, 7
+; CHECK-NEXT:    v_readlane_b32 s6, v41, 8
+; CHECK-NEXT:    v_readlane_b32 s7, v41, 9
+; CHECK-NEXT:    v_readlane_b32 s4, v41, 10
+; CHECK-NEXT:    v_readlane_b32 s5, v41, 11
+; CHECK-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[20:21], s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[20:21]
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[22:23]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_readlane_b32 s4, v41, 16
+; CHECK-NEXT:    v_readlane_b32 s5, v41, 17
+; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_2
+; CHECK-NEXT:  ; %bb.4:
+; CHECK-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[34:35]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s4, v41, 12
+; CHECK-NEXT:    v_readlane_b32 s5, v41, 13
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_readlane_b32 s4, v40, 4
+; CHECK-NEXT:    v_readlane_b32 s34, v40, 2
+; CHECK-NEXT:    v_readlane_b32 s35, v40, 3
+; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %i = load ptr, ptr addrspace(5) %ptr, align 8
+  br label %bb1
+
+bb1:                                              ; preds = %bb
+  tail call void %i()
+  ret i32 0
+}

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to