https://github.com/adelejjeh updated https://github.com/llvm/llvm-project/pull/188116
>From 1283298a74c6ae99472117a3e41a75f8783ddc0d Mon Sep 17 00:00:00 2001 From: Adel Ejjeh <[email protected]> Date: Thu, 12 Mar 2026 10:09:35 -0500 Subject: [PATCH] [AMDGPU][DAGCombiner][GlobalISel] Extend allMulUsesCanBeContracted with FPEXT pattern Extend the allMulUsesCanBeContracted analysis to recognize FPEXT patterns where the multiply result flows through fpext before being used in contractable operations (fadd, fsub). This covers: - fmul --> fpext --> {fadd, fsub}: FPEXT folds if isFPExtFoldable - fmul --> fpext --> fneg --> fsub: FPEXT then FNEG to FSUB - fmul --> fneg --> fpext --> fsub: FNEG then FPEXT folds if foldable Also adds allMulUsesCanBeContracted guards to all FPEXT fold sites in both SDAG (visitFADDForFMACombine, visitFSUBForFMACombine) and GISel (matchCombineFAddFpExtFMulToFMadOrFMA, matchCombineFSubFpExtFMulToFMadOrFMA, matchCombineFSubFpExtFNegFMulToFMadOrFMA). Fixes a missing isFPExtFoldable check in GISel's matchCombineFSubFpExtFMulToFMadOrFMA which could fold without verifying the extension is actually foldable. Co-Authored-By: Claude Opus 4.6 <[email protected]> Made-with: Cursor --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 102 ++- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 91 ++- .../AMDGPU/fma-multiple-uses-contraction.ll | 680 ++++++------------ 4 files changed, 390 insertions(+), 486 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 09c827f71a34d..8440fdcbbd08b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -805,7 +805,8 @@ class CombinerHelper { /// Check if all uses of a multiply can be contracted into fma/fmad /// operations, so that duplicating the multiply is acceptable. - bool allMulUsesCanBeContracted(const MachineInstr &MI) const; + bool allMulUsesCanBeContracted(const MachineInstr &MI, + unsigned PreferredFusedOpcode) const; bool canCombineFMadOrFMA(MachineInstr &MI, bool &AllowFusionGlobally, bool &HasFMAD, bool &Aggressive, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index d2bf2568df276..0941e6da0f40f 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6316,10 +6316,15 @@ static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1, /// would duplicate the multiply without reducing the total number of /// operations. /// -/// Currently checks for the following patterns: +/// This uses a simple, non-recursive check for the following patterns: /// - fmul --> fadd/fsub: Direct contraction /// - fmul --> fneg --> fsub: Contraction through fneg -bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const { +/// - fmul --> fneg --> fpext --> fsub: FNEG then FPEXT folds if foldable +/// - fmul --> fpext --> {fadd, fsub}: FPEXT folds if foldable +/// - fmul --> fpext --> fneg --> fsub: FPEXT then FNEG to FSUB +bool CombinerHelper::allMulUsesCanBeContracted( + const MachineInstr &MI, unsigned PreferredFusedOpcode) const { + const auto &TLI = getTargetLowering(); Register MulReg = MI.getOperand(0).getReg(); for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(MulReg)) { @@ -6329,13 +6334,66 @@ bool CombinerHelper::allMulUsesCanBeContracted(const MachineInstr &MI) const { if (Opcode == TargetOpcode::G_FADD || Opcode == TargetOpcode::G_FSUB) continue; - // G_FNEG use - contractable if all users of the fneg are G_FSUB. + // FNEG --> FSUB pattern + // Also handles FNEG --> FPEXT --> FSUB if (Opcode == TargetOpcode::G_FNEG) { Register FNegReg = UseMI.getOperand(0).getReg(); - for (const MachineInstr &FNegUser : MRI.use_nodbg_instructions(FNegReg)) { - unsigned FNegUserOp = FNegUser.getOpcode(); - if (FNegUserOp != TargetOpcode::G_FSUB) + // ALL users of the FNEG must be contractable FSUBs or FPEXTs leading to + // FSUBs + for (const MachineInstr &FNegUseMI : + MRI.use_nodbg_instructions(FNegReg)) { + unsigned FNegUseOpcode = FNegUseMI.getOpcode(); + + if (FNegUseOpcode == TargetOpcode::G_FSUB) + continue; + if (FNegUseOpcode == TargetOpcode::G_FPEXT) { + // FNEG --> FPEXT --> FSUB + Register FNegFPExtReg = FNegUseMI.getOperand(0).getReg(); + for (const MachineInstr &FNegFPExtUseMI : + MRI.use_nodbg_instructions(FNegFPExtReg)) { + if (FNegFPExtUseMI.getOpcode() != TargetOpcode::G_FSUB) + return false; + // FPEXT use is FSUB, check if can be folded in + if (!TLI.isFPExtFoldable( + FNegFPExtUseMI, PreferredFusedOpcode, + MRI.getType(FNegFPExtUseMI.getOperand(0).getReg()), + MRI.getType(FNegReg))) + return false; + } + continue; + } + return false; + } + continue; + } + + // FP_EXTEND - check if ALL users are FADD, FSUB, or FNEG --> FSUB + if (Opcode == TargetOpcode::G_FPEXT) { + Register FPExtReg = UseMI.getOperand(0).getReg(); + + // ALL users of the FP_EXTEND must be contractable operations or FNEGs + for (const MachineInstr &FPExtUseMI : + MRI.use_nodbg_instructions(FPExtReg)) { + if (!TLI.isFPExtFoldable(FPExtUseMI, PreferredFusedOpcode, + MRI.getType(FPExtUseMI.getOperand(0).getReg()), + MRI.getType(MulReg))) return false; + unsigned ExtUseOpcode = FPExtUseMI.getOpcode(); + if (ExtUseOpcode == TargetOpcode::G_FADD || + ExtUseOpcode == TargetOpcode::G_FSUB) { + continue; + } + if (ExtUseOpcode == TargetOpcode::G_FNEG) { + // FP_EXTEND --> FNEG --> FSUB + Register FPExtFNegReg = FPExtUseMI.getOperand(0).getReg(); + for (const MachineInstr &FPExtFNegUseMI : + MRI.use_nodbg_instructions(FPExtFNegReg)) { + if (FPExtFNegUseMI.getOpcode() != TargetOpcode::G_FSUB) + return false; + } + continue; + } + return false; } continue; } @@ -6407,7 +6465,8 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA( // avoiding duplication of the multiply without reducing total operations. if (isContractableFMul(*LHS.MI, AllowFusionGlobally) && (MRI.hasOneNonDBGUse(LHS.Reg) || - (Aggressive && allMulUsesCanBeContracted(*LHS.MI)))) { + (Aggressive && + allMulUsesCanBeContracted(*LHS.MI, PreferredFusedOpcode)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, {LHS.MI->getOperand(1).getReg(), @@ -6421,7 +6480,8 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA( // avoiding duplication of the multiply without reducing total operations. if (isContractableFMul(*RHS.MI, AllowFusionGlobally) && (MRI.hasOneNonDBGUse(RHS.Reg) || - (Aggressive && allMulUsesCanBeContracted(*RHS.MI)))) { + (Aggressive && + allMulUsesCanBeContracted(*RHS.MI, PreferredFusedOpcode)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, {RHS.MI->getOperand(1).getReg(), @@ -6464,6 +6524,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA( MachineInstr *FpExtSrc; if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) && isContractableFMul(*FpExtSrc, AllowFusionGlobally) && + allMulUsesCanBeContracted(*FpExtSrc, PreferredFusedOpcode) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FpExtSrc->getOperand(1).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { @@ -6479,6 +6540,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA( // Note: Commutes FADD operands. if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) && isContractableFMul(*FpExtSrc, AllowFusionGlobally) && + allMulUsesCanBeContracted(*FpExtSrc, PreferredFusedOpcode) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType, MRI.getType(FpExtSrc->getOperand(1).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { @@ -6718,7 +6780,8 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA( if (FirstMulHasFewerUses && (isContractableFMul(*LHS.MI, AllowFusionGlobally) && (MRI.hasOneNonDBGUse(LHS.Reg) || - (Aggressive && allMulUsesCanBeContracted(*LHS.MI))))) { + (Aggressive && + allMulUsesCanBeContracted(*LHS.MI, PreferredFusedOpcode))))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0); B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, @@ -6732,7 +6795,8 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA( // avoiding duplication of the multiply without reducing total operations. if (isContractableFMul(*RHS.MI, AllowFusionGlobally) && (MRI.hasOneNonDBGUse(RHS.Reg) || - (Aggressive && allMulUsesCanBeContracted(*RHS.MI)))) { + (Aggressive && + allMulUsesCanBeContracted(*RHS.MI, PreferredFusedOpcode)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { Register NegY = B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0); @@ -6771,7 +6835,8 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA( isContractableFMul(*FMulMI, AllowFusionGlobally) && ((MRI.hasOneNonDBGUse(LHSReg) && MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) || - (Aggressive && allMulUsesCanBeContracted(*FMulMI)))) { + (Aggressive && + allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { Register NegX = B.buildFNeg(DstTy, FMulMI->getOperand(1).getReg()).getReg(0); @@ -6795,7 +6860,8 @@ bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA( isContractableFMul(*FMulMI, AllowFusionGlobally) && ((MRI.hasOneNonDBGUse(RHSReg) && MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) || - (Aggressive && allMulUsesCanBeContracted(*FMulMI)))) { + (Aggressive && + allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode)))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()}, {FMulMI->getOperand(1).getReg(), @@ -6828,7 +6894,10 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA( // fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z)) if (mi_match(LHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(LHSReg)) && + ((MRI.hasOneNonDBGUse(LHSReg) && + MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) || + (Aggressive && + allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode))) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { @@ -6846,7 +6915,10 @@ bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA( // fold (fsub x, (fpext (fmul y, z))) -> (fma (fneg (fpext y)), (fpext z), x) if (mi_match(RHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && - (Aggressive || MRI.hasOneNonDBGUse(RHSReg)) && + ((MRI.hasOneNonDBGUse(RHSReg) && + MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg())) || + (Aggressive && + allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode))) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { @@ -6896,6 +6968,7 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA( if ((mi_match(LHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) || mi_match(LHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && + allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { @@ -6912,6 +6985,7 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA( if ((mi_match(RHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) || mi_match(RHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) && isContractableFMul(*FMulMI, AllowFusionGlobally) && + allMulUsesCanBeContracted(*FMulMI, PreferredFusedOpcode) && TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy, MRI.getType(FMulMI->getOperand(0).getReg()))) { MatchInfo = [=, &MI](MachineIRBuilder &B) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ffcdb9c40bb81..e4f1c8adc8abe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17685,25 +17685,75 @@ static bool isFusedOp(const MatchContextClass &Matcher, SDValue N) { /// would duplicate the multiply without reducing the total number of /// operations. /// -/// Currently checks for the following patterns: +/// This uses a simple, non-recursive check for the following patterns: /// - fmul --> fadd/fsub: Direct contraction /// - fmul --> fneg --> fsub: Contraction through fneg -static bool allMulUsesCanBeContracted(SDValue Mul) { +/// - fmul --> fneg --> fpext --> fsub: FNEG then FPEXT folds if foldable +/// - fmul --> fpext --> {fadd, fsub}: FPEXT folds if foldable +/// - fmul --> fpext --> fneg --> fsub: FPEXT then FNEG to FSUB +static bool allMulUsesCanBeContracted(SDValue Mul, + const unsigned PreferredFusedOpcode, + const TargetLowering &TLI, + SelectionDAG &DAG) { for (const auto *User : Mul->users()) { - unsigned Opcode = User->getOpcode(); + SDNode *UserNode = const_cast<SDNode *>(User); + unsigned Opcode = UserNode->getOpcode(); // Direct FADD/FSUB - contractable. if (Opcode == ISD::FADD || Opcode == ISD::FSUB) continue; - // FNEG use - contractable if all users of the fneg are FSUB. + // FNEG - check if ALL users are FSUB or foldable FPEXT --> FSUB if (Opcode == ISD::FNEG) { - for (const auto *FNegUser : User->users()) { + for (const auto *FNegUser : UserNode->users()) { unsigned FNegUserOp = FNegUser->getOpcode(); - if (FNegUserOp != ISD::FSUB) - return false; + + if (FNegUserOp == ISD::FSUB) { + // FNEG --> FSUB + continue; + } + if (FNegUserOp == ISD::FP_EXTEND) { + // FNEG --> FPEXT --> FSUB + EVT SrcVT = UserNode->getValueType(0); // Src of FPEXT is the FNEG + for (const auto *FNegFPExtUser : FNegUser->users()) { + if (FNegFPExtUser->getOpcode() != ISD::FSUB) + return false; + if (!TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, + FNegFPExtUser->getValueType(0), SrcVT)) + return false; + } + continue; + } + return false; } - continue; + continue; // All FNEG uses are contractable + } + + // FP_EXTEND - check if ALL users are FADD, FSUB, or FNEG --> FSUB + if (Opcode == ISD::FP_EXTEND) { + EVT SrcVT = Mul.getValueType(); + + for (const auto *FPExtUser : UserNode->users()) { + unsigned ExtUserOp = FPExtUser->getOpcode(); + EVT DstVT = FPExtUser->getValueType(0); + if (!TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, DstVT, SrcVT)) + return false; // this FPEXT cannot be folded + + if (ExtUserOp == ISD::FADD || ExtUserOp == ISD::FSUB) { + continue; // FPEXT --> {FADD, FSUB} is contractable + } + if (ExtUserOp == ISD::FNEG) { + // FP_EXTEND --> FNEG --> FSUB + for (const auto *FPExtFNegUser : FPExtUser->users()) { + if (FPExtFNegUser->getOpcode() != ISD::FSUB) { + return false; + } + } + continue; + } + return false; + } + continue; // All FPEXT uses are contractable } // Any other use type is not currently recognized as contractable. @@ -17777,7 +17827,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // Only contract if the multiply has one use or all uses are contractable, // avoiding duplication of the multiply without reducing total operations. if (isContractableFMUL(N0) && - (N0->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(N0)))) { + (N0->hasOneUse() || + (Aggressive && + allMulUsesCanBeContracted(N0, PreferredFusedOpcode, TLI, DAG)))) { return matcher.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), N1); } @@ -17787,7 +17839,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // Only contract if the multiply has one use or all uses are contractable, // avoiding duplication of the multiply without reducing total operations. if (isContractableFMUL(N1) && - (N1->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(N1)))) { + (N1->hasOneUse() || + (Aggressive && + allMulUsesCanBeContracted(N1, PreferredFusedOpcode, TLI, DAG)))) { return matcher.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), N0); } @@ -17834,6 +17888,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && + allMulUsesCanBeContracted(N00, PreferredFusedOpcode, TLI, DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return matcher.getNode( @@ -17848,6 +17903,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (matcher.match(N1, ISD::FP_EXTEND)) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && + allMulUsesCanBeContracted(N10, PreferredFusedOpcode, TLI, DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { return matcher.getNode( @@ -18006,7 +18062,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // avoiding duplication of the multiply without reducing total operations. auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) { if (isContractableFMUL(XY) && - (XY->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(XY)))) { + (XY->hasOneUse() || + (Aggressive && + allMulUsesCanBeContracted(XY, PreferredFusedOpcode, TLI, DAG)))) { return matcher.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0), XY.getOperand(1), matcher.getNode(ISD::FNEG, SL, VT, Z)); @@ -18020,7 +18078,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // avoiding duplication of the multiply without reducing total operations. auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) { if (isContractableFMUL(YZ) && - (YZ->hasOneUse() || (Aggressive && allMulUsesCanBeContracted(YZ)))) { + (YZ->hasOneUse() || + (Aggressive && + allMulUsesCanBeContracted(YZ, PreferredFusedOpcode, TLI, DAG)))) { return matcher.getNode( PreferredFusedOpcode, SL, VT, matcher.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)), @@ -18059,7 +18119,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // multiply without reducing total operations. if (matcher.match(N0, ISD::FNEG) && isContractableFMUL(N0.getOperand(0)) && ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) || - (Aggressive && allMulUsesCanBeContracted(N0.getOperand(0))))) { + (Aggressive && allMulUsesCanBeContracted( + N0.getOperand(0), PreferredFusedOpcode, TLI, DAG)))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); return matcher.getNode(PreferredFusedOpcode, SL, VT, @@ -18074,6 +18135,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && + allMulUsesCanBeContracted(N00, PreferredFusedOpcode, TLI, DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return matcher.getNode( @@ -18090,6 +18152,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (matcher.match(N1, ISD::FP_EXTEND)) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && + allMulUsesCanBeContracted(N10, PreferredFusedOpcode, TLI, DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { return matcher.getNode( @@ -18112,6 +18175,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (matcher.match(N00, ISD::FNEG)) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && + allMulUsesCanBeContracted(N000, PreferredFusedOpcode, TLI, DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { return matcher.getNode( @@ -18136,6 +18200,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (matcher.match(N00, ISD::FP_EXTEND)) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && + allMulUsesCanBeContracted(N000, PreferredFusedOpcode, TLI, DAG) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N000.getValueType())) { return matcher.getNode( diff --git a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll index 79ce1813cb677..4fa74d8c9669d 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-multiple-uses-contraction.ll @@ -1232,12 +1232,6 @@ define { float, float } @mul_fsub_and_fneg_fsub_contractable(float %a, float %b, ; FPEXT patterns ; Tests for allMulUsesCanBeContracted with fpext(fmul) feeding into ; fadd, fsub, and fneg combinations. -; -; NOTE: The allMulUsesCanBeContracted guard does not yet recognize fpext -; users of the multiply. That support is added by later patches in the -; series. Until then, the CHECK lines below reflect current (potentially -; over-conservative) codegen and may not match the "Expected:" comments on -; individual tests. ; ========================================================================== ; Test case: fpext(fmul) -> {fadd, fadd} (chained adds, second uses result of first). @@ -1365,77 +1359,41 @@ define { float, float } @fpext_noncontractable(float %x, float %y, half %u, half ; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable: -; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable: -; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable: -; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable: -; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable: -; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-SDAG-LABEL: fpext_noncontractable: +; GFX9_4-SDAG: ; %bb.0: ; %entry +; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3 +; GFX9_4-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9_4-SDAG-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0] +; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable: -; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-GISEL-LABEL: fpext_noncontractable: +; GFX9_4-GISEL: ; %bb.0: ; %entry +; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3 +; GFX9_4-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9_4-GISEL-NEXT: v_fma_mix_f32 v0, v0, 1.0, v4 op_sel_hi:[1,1,0] +; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable: -; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-SDAG-LABEL: fpext_noncontractable: +; GFX12_5-SDAG: ; %bb.0: ; %entry +; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] +; GFX12_5-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable: -; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-GISEL-LABEL: fpext_noncontractable: +; GFX12_5-GISEL: ; %bb.0: ; %entry +; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-GISEL-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] +; GFX12_5-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext contract half %mul to float @@ -1467,67 +1425,37 @@ define { float, half } @fpext_noncontractable_2(float %x, float %y, half %u, hal ; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_2: -; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_2: -; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_2: -; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_2: -; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_2: -; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-SDAG-LABEL: fpext_noncontractable_2: +; GFX9_4-SDAG: ; %bb.0: ; %entry +; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX9_4-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] +; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_2: -; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-GISEL-LABEL: fpext_noncontractable_2: +; GFX9_4-GISEL: ; %bb.0: ; %entry +; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX9_4-GISEL-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] +; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_2: -; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-SDAG-LABEL: fpext_noncontractable_2: +; GFX12_5-SDAG: ; %bb.0: ; %entry +; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-SDAG-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] +; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_2: -; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-GISEL-LABEL: fpext_noncontractable_2: +; GFX12_5-GISEL: ; %bb.0: ; %entry +; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-GISEL-NEXT: v_fma_mix_f32 v0, v1, 1.0, v4 op_sel_hi:[1,1,0] +; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext contract half %mul to float @@ -1857,77 +1785,41 @@ define { float, float } @fpext_noncontractable_sub(float %x, float %y, half %u, ; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub: -; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub: -; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub: -; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub: -; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub: -; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1] -; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-SDAG-LABEL: fpext_noncontractable_sub: +; GFX9_4-SDAG: ; %bb.0: ; %entry +; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3 +; GFX9_4-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9_4-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1] +; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub: -; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1] -; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-GISEL-LABEL: fpext_noncontractable_sub: +; GFX9_4-GISEL: ; %bb.0: ; %entry +; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3 +; GFX9_4-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9_4-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v0 op_sel_hi:[0,1,1] +; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub: -; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] -; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-SDAG-LABEL: fpext_noncontractable_sub: +; GFX12_5-SDAG: ; %bb.0: ; %entry +; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] +; GFX12_5-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub: -; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] -; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-GISEL-LABEL: fpext_noncontractable_sub: +; GFX12_5-GISEL: ; %bb.0: ; %entry +; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] +; GFX12_5-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext contract half %mul to float @@ -1959,67 +1851,37 @@ define { float, half } @fpext_noncontractable_sub_2(float %x, float %y, half %u, ; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub_2: -; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub_2: -; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_noncontractable_sub_2: -; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_noncontractable_sub_2: -; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, -v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX9_4-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub_2: -; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] -; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-SDAG-LABEL: fpext_noncontractable_sub_2: +; GFX9_4-SDAG: ; %bb.0: ; %entry +; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX9_4-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] +; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub_2: -; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] -; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-GISEL-LABEL: fpext_noncontractable_sub_2: +; GFX9_4-GISEL: ; %bb.0: ; %entry +; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX9_4-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] +; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12_5-SDAG-F32DENORM-LABEL: fpext_noncontractable_sub_2: -; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] -; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-SDAG-LABEL: fpext_noncontractable_sub_2: +; GFX12_5-SDAG: ; %bb.0: ; %entry +; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] +; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GFX12_5-GISEL-F32DENORM-LABEL: fpext_noncontractable_sub_2: -; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] -; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-GISEL-LABEL: fpext_noncontractable_sub_2: +; GFX12_5-GISEL: ; %bb.0: ; %entry +; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, v1 op_sel_hi:[0,1,1] +; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext contract half %mul to float @@ -2365,73 +2227,37 @@ define {float, half} @fpext_fneg_fpext_fsub_noncontractable(float %x, float %y, ; GFX9-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable: -; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable: -; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable: -; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12_5-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fpext_fsub_noncontractable: -; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12_5-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable: -; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] -; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable: +; GFX9_4-SDAG: ; %bb.0: ; %entry +; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX9_4-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] +; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable: -; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] -; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable: +; GFX9_4-GISEL: ; %bb.0: ; %entry +; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX9_4-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] +; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable: -; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] -; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-SDAG-LABEL: fpext_fneg_fpext_fsub_noncontractable: +; GFX12_5-SDAG: ; %bb.0: ; %entry +; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] +; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fpext_fsub_noncontractable: -; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] -; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-GISEL-LABEL: fpext_fneg_fpext_fsub_noncontractable: +; GFX12_5-GISEL: ; %bb.0: ; %entry +; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] +; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31] entry: %mul = fmul contract half %u, %v %neg = fneg contract half %mul @@ -2578,83 +2404,41 @@ define {float, float} @fpext_fneg_fsub_noncontractable(float %x, float %y, half ; GFX9-GISEL-NEXT: v_sub_f32_e64 v0, -v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable: -; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX9_4-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable: -; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32FLUSH-NEXT: s_nop 0 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31] -; -; GFX12_5-SDAG-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable: -; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12_5-SDAG-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX12_5-GISEL-F32FLUSH-LABEL: fpext_fneg_fsub_noncontractable: -; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12_5-GISEL-F32FLUSH-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] -; -; GFX9_4-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable: -; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-SDAG-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1] -; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-SDAG-LABEL: fpext_fneg_fsub_noncontractable: +; GFX9_4-SDAG: ; %bb.0: ; %entry +; GFX9_4-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-SDAG-NEXT: v_mul_f16_e32 v0, v2, v3 +; GFX9_4-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9_4-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1] +; GFX9_4-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable: -; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v0, v2, v3 -; GFX9_4-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1] -; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-GISEL-LABEL: fpext_fneg_fsub_noncontractable: +; GFX9_4-GISEL: ; %bb.0: ; %entry +; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-GISEL-NEXT: v_mul_f16_e32 v0, v2, v3 +; GFX9_4-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9_4-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v0 op_sel_hi:[0,1,1] +; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX12_5-SDAG-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable: -; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-SDAG-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-SDAG-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] -; GFX12_5-SDAG-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-SDAG-LABEL: fpext_fneg_fsub_noncontractable: +; GFX12_5-SDAG: ; %bb.0: ; %entry +; GFX12_5-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-SDAG-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-SDAG-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] +; GFX12_5-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX12_5-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GFX12_5-GISEL-F32DENORM-LABEL: fpext_fneg_fsub_noncontractable: -; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] -; GFX12_5-GISEL-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-GISEL-LABEL: fpext_fneg_fsub_noncontractable: +; GFX12_5-GISEL: ; %bb.0: ; %entry +; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-GISEL-NEXT: v_mul_f16_e32 v1, v2, v3 +; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12_5-GISEL-NEXT: v_fma_mix_f32 v0, v4, -1.0, -v1 op_sel_hi:[0,1,1] +; GFX12_5-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31] entry: %mul = fmul contract half %u, %v %mul.ext = fpext contract half %mul to float @@ -2955,26 +2739,26 @@ define {float, float, float} @fma_chain_fpext_noncontractable(float %x, float %y ; GFX9_4-SDAG-F32FLUSH-LABEL: fma_chain_fpext_noncontractable: ; GFX9_4-SDAG-F32FLUSH: ; %bb.0: ; %entry ; GFX9_4-SDAG-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0] +; GFX9_4-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v7, v2, v3 +; GFX9_4-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] ; GFX9_4-SDAG-F32FLUSH-NEXT: s_nop 0 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1 -; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v5 op_sel_hi:[1,1,0] -; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4 +; GFX9_4-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX9_4-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v7, 1.0, v5 op_sel_hi:[1,1,0] +; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6 ; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9_4-GISEL-F32FLUSH-LABEL: fma_chain_fpext_noncontractable: ; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry ; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v6 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0] +; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v7, v2, v3 +; GFX9_4-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0] ; GFX9_4-GISEL-F32FLUSH-NEXT: s_nop 0 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, v5 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4 +; GFX9_4-GISEL-F32FLUSH-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v7, 1.0, v5 op_sel_hi:[1,1,0] +; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2 ; GFX9_4-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v2, v6 ; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2983,13 +2767,13 @@ define {float, float, float} @fma_chain_fpext_noncontractable(float %x, float %y ; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12_5-SDAG-F32FLUSH-NEXT: s_wait_kmcnt 0x0 ; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v3, v2, v3, v5 op_sel_hi:[1,1,0] -; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12_5-SDAG-F32FLUSH-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12_5-SDAG-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1 -; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12_5-SDAG-F32FLUSH-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v0, v4 +; GFX12_5-SDAG-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, 1.0, v5 op_sel_hi:[1,1,0] +; GFX12_5-SDAG-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX12_5-SDAG-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12_5-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4 ; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] ; ; GFX12_5-GISEL-F32FLUSH-LABEL: fma_chain_fpext_noncontractable: @@ -2997,13 +2781,13 @@ define {float, float, float} @fma_chain_fpext_noncontractable(float %x, float %y ; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0 ; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v4, v2, v3, v4 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v3, v2, v3, v5 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12_5-GISEL-F32FLUSH-NEXT: v_fmac_f32_e32 v4, v0, v1 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v6 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12_5-GISEL-F32FLUSH-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v0, v4 +; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, 1.0, v5 op_sel_hi:[1,1,0] +; GFX12_5-GISEL-F32FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12_5-GISEL-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4 ; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9_4-SDAG-F32DENORM-LABEL: fma_chain_fpext_noncontractable: @@ -3097,14 +2881,14 @@ define {float, float} @fma_chain_fpext_fsub_contractable(float %x, float %y, hal ; GFX9_4-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4 ; GFX9_4-SDAG-F32FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-GISEL-F32FLUSH-LABEL: fma_chain_fpext_fsub_contractable: -; GFX9_4-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v0, v1, v6 op_sel_hi:[0,0,1] -; GFX9_4-GISEL-F32FLUSH-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX9_4-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, -v5 op_sel_hi:[1,1,0] -; GFX9_4-GISEL-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; GFX9_4-GISEL-LABEL: fma_chain_fpext_fsub_contractable: +; GFX9_4-GISEL: ; %bb.0: ; %entry +; GFX9_4-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9_4-GISEL-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX9_4-GISEL-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX9_4-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX9_4-GISEL-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1] +; GFX9_4-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12_5-SDAG-F32FLUSH-LABEL: fma_chain_fpext_fsub_contractable: ; GFX12_5-SDAG-F32FLUSH: ; %bb.0: ; %entry @@ -3117,16 +2901,16 @@ define {float, float} @fma_chain_fpext_fsub_contractable(float %x, float %y, hal ; GFX12_5-SDAG-F32FLUSH-NEXT: v_mov_b32_e32 v0, v4 ; GFX12_5-SDAG-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] ; -; GFX12_5-GISEL-F32FLUSH-LABEL: fma_chain_fpext_fsub_contractable: -; GFX12_5-GISEL-F32FLUSH: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32FLUSH-NEXT: v_mul_f16_e32 v6, v2, v3 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v0, v0, v1, v6 op_sel_hi:[0,0,1] -; GFX12_5-GISEL-F32FLUSH-NEXT: v_fma_mix_f32 v1, v2, v3, -v5 op_sel_hi:[1,1,0] -; GFX12_5-GISEL-F32FLUSH-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX12_5-GISEL-F32FLUSH-NEXT: s_set_pc_i64 s[30:31] +; GFX12_5-GISEL-LABEL: fma_chain_fpext_fsub_contractable: +; GFX12_5-GISEL: ; %bb.0: ; %entry +; GFX12_5-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12_5-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12_5-GISEL-NEXT: v_mul_f16_e32 v2, v2, v3 +; GFX12_5-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12_5-GISEL-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] +; GFX12_5-GISEL-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1] +; GFX12_5-GISEL-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX12_5-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9_4-SDAG-F32DENORM-LABEL: fma_chain_fpext_fsub_contractable: ; GFX9_4-SDAG-F32DENORM: ; %bb.0: ; %entry @@ -3137,15 +2921,6 @@ define {float, float} @fma_chain_fpext_fsub_contractable(float %x, float %y, hal ; GFX9_4-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1] ; GFX9_4-SDAG-F32DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9_4-GISEL-F32DENORM-LABEL: fma_chain_fpext_fsub_contractable: -; GFX9_4-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX9_4-GISEL-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9_4-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] -; GFX9_4-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX9_4-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1] -; GFX9_4-GISEL-F32DENORM-NEXT: s_setpc_b64 s[30:31] -; ; GFX12_5-SDAG-F32DENORM-LABEL: fma_chain_fpext_fsub_contractable: ; GFX12_5-SDAG-F32DENORM: ; %bb.0: ; %entry ; GFX12_5-SDAG-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3156,17 +2931,6 @@ define {float, float} @fma_chain_fpext_fsub_contractable(float %x, float %y, hal ; GFX12_5-SDAG-F32DENORM-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1] ; GFX12_5-SDAG-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4 ; GFX12_5-SDAG-F32DENORM-NEXT: s_set_pc_i64 s[30:31] -; -; GFX12_5-GISEL-F32DENORM-LABEL: fma_chain_fpext_fsub_contractable: -; GFX12_5-GISEL-F32DENORM: ; %bb.0: ; %entry -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: s_wait_kmcnt 0x0 -; GFX12_5-GISEL-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3 -; GFX12_5-GISEL-F32DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1] -; GFX12_5-GISEL-F32DENORM-NEXT: v_fma_mix_f32 v1, v5, -1.0, v2 op_sel_hi:[0,1,1] -; GFX12_5-GISEL-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX12_5-GISEL-F32DENORM-NEXT: s_set_pc_i64 s[30:31] entry: %mul = fmul contract reassoc half %u, %v %mul.ext = fpext contract reassoc half %mul to float _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
