llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> This should address the case where the result isn't fully used, resulting in partial copy bundles from the MFMA result. --- Patch is 56.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153019.diff 6 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp (+45-61) - (modified) llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir (-91) - (modified) llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir (+89-8) - (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir (+18-18) - (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir (+38-38) - (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll (+36-62) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index a8dfdbe5dd494..5206f32ec99e5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -14,8 +14,6 @@ /// MFMA opcode. /// /// TODO: -/// - Handle SplitKit partial copy bundles, and not just full copy instructions -/// /// - Update LiveIntervals incrementally instead of recomputing from scratch /// //===----------------------------------------------------------------------===// @@ -37,6 +35,7 @@ using namespace llvm; namespace { class AMDGPURewriteAGPRCopyMFMAImpl { + MachineFunction &MF; const GCNSubtarget &ST; const SIInstrInfo &TII; const SIRegisterInfo &TRI; @@ -53,7 +52,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl { AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS, const RegisterClassInfo &RegClassInfo) - : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), + : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), LIS(LIS), RegClassInfo(RegClassInfo) {} @@ -71,26 +70,26 @@ class AMDGPURewriteAGPRCopyMFMAImpl { /// /// \p RewriteRegs will accumulate the set of register used by those MFMAs /// that need to have the register classes adjusted. - const TargetRegisterClass *recomputeRegClassExceptRewritable( - Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC, - SmallVectorImpl<MachineInstr *> &RewriteCandidates, + bool recomputeRegClassExceptRewritable( + Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates, SmallSetVector<Register, 4> &RewriteRegs) const; bool run(MachineFunction &MF) const; }; -const TargetRegisterClass * -AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( - Register StartReg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC, - SmallVectorImpl<MachineInstr *> &RewriteCandidates, +bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( + Register StartReg, SmallVectorImpl<MachineInstr *> &RewriteCandidates, SmallSetVector<Register, 4> &RewriteRegs) const { SmallVector<Register, 8> Worklist = {StartReg}; // Recursively visit all transitive MFMA users while (!Worklist.empty()) { Register Reg = Worklist.pop_back_val(); + const TargetRegisterClass *OldRC = MRI.getRegClass(Reg); + + // Inflate to the equivalent AV_* class. + const TargetRegisterClass *NewRC = TRI.getLargestLegalSuperClass(OldRC, MF); + // Accumulate constraints from all uses. for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { // Apply the effect of the given operand to NewRC. @@ -101,23 +100,40 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( // either AGPR or VGPR in src0/src1, so don't bother checking the // constraint effects of the individual operands. if (isRewriteCandidate(*MI)) { - for (AMDGPU::OpName OpName : - {AMDGPU::OpName::vdst, AMDGPU::OpName::src2}) { - const MachineOperand *Op = TII.getNamedOperand(*MI, OpName); + const MachineOperand *VDst = + TII.getNamedOperand(*MI, AMDGPU::OpName::vdst); + const MachineOperand *Src2 = + TII.getNamedOperand(*MI, AMDGPU::OpName::src2); + for (const MachineOperand *Op : {VDst, Src2}) { if (!Op->isReg()) continue; Register OtherReg = Op->getReg(); - if (OtherReg != Reg) { - if (RewriteRegs.insert(OtherReg)) - Worklist.push_back(OtherReg); - } + if (OtherReg.isPhysical()) + return false; + + if (OtherReg != Reg && RewriteRegs.insert(OtherReg)) + Worklist.push_back(OtherReg); } - LLVM_DEBUG(dbgs() << "Ignoring effects of " << *MI); + if (!is_contained(RewriteCandidates, MI)) { + LLVM_DEBUG({ + Register VDstPhysReg = VRM.getPhys(VDst->getReg()); + dbgs() << "Attempting to replace VGPR MFMA with AGPR version:" + << " Dst=[" << printReg(VDst->getReg()) << " => " + << printReg(VDstPhysReg, &TRI); + + if (Src2->isReg()) { + Register Src2PhysReg = VRM.getPhys(Src2->getReg()); + dbgs() << ", Src2=[" << printReg(Src2->getReg(), &TRI) << " => " + << printReg(Src2PhysReg, &TRI); + } + + dbgs() << "]: " << MI; + }); - if (!is_contained(RewriteCandidates, MI)) RewriteCandidates.push_back(MI); + } continue; } @@ -126,13 +142,14 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI); if (!NewRC || NewRC == OldRC) { LLVM_DEBUG(dbgs() << "User of " << printReg(Reg, &TRI) - << " cannot be reassigned to AGPR: " << *MI); - return nullptr; + << " cannot be reassigned to " + << TRI.getRegClassName(NewRC) << ": " << *MI); + return false; } } } - return NewRC; + return true; } /// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a @@ -225,10 +242,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { for (VNInfo *VNI : LI.vnis()) { MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); - - // TODO: Handle SplitKit produced copy bundles for partially defined - // registers. - if (!DefMI || !DefMI->isFullCopy()) + if (!DefMI || !DefMI->isCopy()) continue; Register MFMADstReg = DefMI->getOperand(1).getReg(); @@ -241,34 +255,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { if (!MFMA || !isRewriteCandidate(*MFMA)) continue; - MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2); - Register Src2Reg; - if (Src2->isReg()) { - Src2Reg = Src2->getReg(); - if (!Src2Reg.isVirtual()) - continue; - } - - // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead - // of an AGPR or VGPR subclass, so we can't simply use the result on the - // assignment. - - LLVM_DEBUG({ - dbgs() << "Attempting to replace VGPR MFMA with AGPR version:" - << " Dst=[" << printReg(VReg) << " => " - << printReg(PhysReg, &TRI); - - if (Src2Reg) { - Register Src2PhysReg = VRM.getPhys(Src2Reg); - dbgs() << ", Src2=[" << printReg(Src2Reg, &TRI) << " => " - << printReg(Src2PhysReg, &TRI); - } - - dbgs() << "]: " << *MFMA; - }); - - const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg); - // src2 and dst have the same physical class constraint; try to preserve // the original src2 subclass if one were to exist. SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA}; @@ -287,11 +273,9 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // // Note recomputeRegClassExceptRewritable will consider the constraints of // this MFMA's src2 as well as the src2/dst of any transitive MFMA users. - const TargetRegisterClass *DstExceptRC = - recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC, - RewriteCandidates, RewriteRegs); - if (!DstExceptRC) { - LLVM_DEBUG(dbgs() << "Could not recompute the regclass of " + if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates, + RewriteRegs)) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg " << printReg(MFMADstReg, &TRI) << '\n'); continue; } diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir index 3103d635200c6..45c185b6154ea 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir @@ -20,10 +20,6 @@ ret void } - define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 { - ret void - } - define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first() #1 { ret void } @@ -420,93 +416,6 @@ body: | ... -# Non-mac variant, src2 is the same VGPR, but a different subregister. ---- -name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg -tracksRegLiveness: true -machineFunctionInfo: - isEntryFunction: true - stackPtrOffsetReg: '$sgpr32' - occupancy: 10 - sgprForEXECCopy: '$sgpr100_sgpr101' -body: | - ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 - ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0 - ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1 - ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc - ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc - ; CHECK-NEXT: S_BRANCH %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 - ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 - ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 - ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 - ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) - ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - bb.0: - S_NOP 0, implicit-def $agpr0 - renamable $sgpr0 = S_MOV_B32 0 - undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec - renamable $sgpr1 = COPY renamable $sgpr0 - %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1 - renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc - %0.sub9:vreg_1024_align2 = COPY %0.sub8 - - bb.1: - liveins: $vcc - - undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) - %0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) - %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec - S_CBRANCH_VCCNZ %bb.1, implicit $vcc - S_BRANCH %bb.2 - - bb.2: - ; No VGPRs available for %0 - S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 - S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 - S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 - S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 - S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1) - GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) - GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) - S_ENDPGM 0 - -... - # There isn't an assignable AGPR around the first MFMA. --- name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index 3de86da766af7..735c7e081b21a 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -1116,11 +1116,8 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $agpr0_agpr1 ; CHECK-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 @@ -1202,10 +1199,8 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; CHECK-NEXT: renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $agpr0_agpr1 - ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr2_vgpr3_vgpr4_vgpr5 + ; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1 + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 @@ -1957,3 +1952,89 @@ body: | S_ENDPGM 0 ... + +# Non-mac variant, src2 is the same VGPR, but a different subregister. +--- +name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0 + ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0 + ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0,... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/153019 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits