llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Frederik Harwath (frederik-h) <details> <summary>Changes</summary> Change si-peephole-sdwa to eliminate V_PACK_B32_F16_e64 instructions by making the instruction that defines the second operand write its result directly to the upper word of the destination. --- Patch is 254.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176383.diff 36 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-12) - (modified) llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (+35) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll (+48-64) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll (+24-32) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll (+40-40) - (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+5-6) - (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+18-20) - (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+28-43) - (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll (+3-4) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+19-31) - (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+12-12) - (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+3-4) - (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+3-4) - (modified) llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll (+126-140) - (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+29-34) - (modified) llvm/test/CodeGen/AMDGPU/fpow.ll (+92-104) - (modified) llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll (+7-12) - (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+4-5) - (modified) llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+29-42) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+68-60) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+87-82) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+21-21) - (modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+13-22) - (modified) 
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+28-28) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+138-98) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+138-98) - (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+68-127) - (modified) llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll (+2-3) - (modified) llvm/test/CodeGen/AMDGPU/llvm.round.ll (+7-8) - (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+29-42) - (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+74-118) - (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+16-19) - (modified) llvm/test/CodeGen/AMDGPU/roundeven.ll (+42-44) - (modified) llvm/test/CodeGen/AMDGPU/sdwa-commute.ll (+2-3) - (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll (+25-30) - (modified) llvm/test/CodeGen/AMDGPU/v_pack.ll (+18-20) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0788cbb18269b..5b2e41ef5f7e8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -655,6 +655,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, break; case ISD::EXTRACT_SUBVECTOR: case ISD::CONCAT_VECTORS: + case ISD::FSIN: + case ISD::FCOS: setOperationAction(Op, VT, Custom); break; default: @@ -9876,6 +9878,35 @@ SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, DAG.getValueType(SmallVT)); } +/// Helper function for LowerINTRINSIC_WO_CHAIN. Replace a \p Op of +/// scalar type with a new node \p NewISD node with one argument which +/// is the operand at index \p OperandIndex of Op. Scalarizes for +/// vector types. +/// +// FIXME The manual scalarization seems to be necessary because the +// Expand fallback is not supported for ISD::INTRINSIC_WO_CHAIN and +// hence the lowering function should not fail for v2f16; see comment +// in SelectionDAGLegalize::ExpandNode. 
+static SDValue BuildScalarizedUnaryOp(SDValue Op, unsigned NewISD, + unsigned OperandIndex, + SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue Operand = Op.getOperand(OperandIndex); + if (!VT.isVector()) + return DAG.getNode(NewISD, DL, VT, Operand); + + EVT ScalarVT = VT.getScalarType(); + unsigned NElts = VT.getVectorNumElements(); + SmallVector<SDValue, 8> Args; + + DAG.ExtractVectorElements(Operand, Args, 0, NElts); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(NewISD, DL, ScalarVT, Args[I]); + + return DAG.getBuildVector(VT, DL, Args); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -10098,10 +10129,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); case Intrinsic::amdgcn_sin: - return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); + return BuildScalarizedUnaryOp(Op, AMDGPUISD::SIN_HW, 1, DAG); case Intrinsic::amdgcn_cos: - return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); + return BuildScalarizedUnaryOp(Op, AMDGPUISD::COS_HW, 1, DAG); case Intrinsic::amdgcn_mul_u24: return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), @@ -10117,7 +10148,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitRemovedIntrinsicError(DAG, DL, VT); } case Intrinsic::amdgcn_fract: - return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + return BuildScalarizedUnaryOp(Op, AMDGPUISD::FRACT, 1, DAG); case Intrinsic::amdgcn_class: return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), @@ -12965,6 +12996,9 @@ SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + unsigned OpC = Op.getOpcode(); + assert((OpC == ISD::FCOS || OpC == ISD::FSIN) && "Wrong trig opcode"); + SDLoc DL(Op); EVT 
VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); @@ -12978,19 +13012,19 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->hasTrigReducedRange()) { SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); - TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags); + SDValue FractId = + DAG.getTargetConstant(Intrinsic::amdgcn_fract, DL, MVT::i32); + TrigVal = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, FractId, MulVal, Flags); } else { TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); } - switch (Op.getOpcode()) { - case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); - case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); - default: - llvm_unreachable("Wrong trig opcode"); - } + Intrinsic::AMDGCNIntrinsics Intrinsic = + OpC == ISD::FSIN ? Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; + SDValue TrigId = DAG.getTargetConstant(Intrinsic, DL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(Op), VT, TrigId, TrigVal, + Flags); } SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index acc4b3f0a68b4..232d975c3fc4e 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -455,6 +455,23 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { // writing WORD_1. Modifiers don't matter because all the bits that // would be impacted are being overwritten by the dst. // Any other case will not work. + // + // FIXME Is this really true for f16 operands? 
That is, this + // change introduced by the v_pack_b32_f16 conversion looks wrong: + //@@ -2394,17 +2394,17 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> + //%a) { + // ; GFX9-LABEL: v_neg_rsq_v2f16: + // ; GFX9: ; %bb.0: + // ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + // -; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD + // dst_unused:UNUSED_PAD src0_sel:WORD_1 + // -; GFX9-NEXT: v_rsq_f16_e32 v0, v0 + // -; GFX9-NEXT: v_pack_b32_f16 v0, -v0, -v1 + // +; GFX9-NEXT: v_rsq_f16_e32 v1, v0 + // +; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 + // dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 + // +; GFX9-NEXT: v_mov_b32_e32 v0, v1 + // ; GFX9-NEXT: s_setpc_b64 s[30:31] SdwaSel DstSel = static_cast<SdwaSel>( TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && @@ -961,7 +978,25 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { return std::make_unique<SDWADstPreserveOperand>( OrDst, OrSDWADef, OrOtherDef, DstSel); + } + case AMDGPU::V_PACK_B32_F16_e64: { + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + + bool InvalidOp = false; + for (auto *Op : {Dst, Src1, Src2}) + if (!Op || !Op->isReg() || Op->getReg().isPhysical()) + InvalidOp = true; + + if (InvalidOp) + break; + + if (isSameReg(*Src1, *Src2)) + break; + // FIXME Figure out necessary restrictions on Src1 and Src2 + return std::make_unique<SDWADstPreserveOperand>(Dst, Src1, Src2, WORD_1); } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll index d046b854fb0d8..9b4b14e6ca105 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -543,14 +543,12 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> 
%y, <4 x half> % ; GFX9-LABEL: test_v4f16_sub_mul: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2 +; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX9-NEXT: v_sub_f16_e32 v0, v2, v4 +; GFX9-NEXT: v_sub_f16_e32 v1, v3, v5 +; GFX9-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -563,27 +561,23 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX9-DENORM-LABEL: test_v4f16_sub_mul: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4 +; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5 +; GFX9-DENORM-NEXT: 
v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_pk_mul_f16 v2, v0, v2 +; GFX10-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX10-NEXT: v_sub_f16_e32 v0, v2, v4 +; GFX10-NEXT: v_sub_f16_e32 v1, v3, v5 +; GFX10-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -596,14 +590,12 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX10-DENORM-LABEL: test_v4f16_sub_mul: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: 
v_pack_b32_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4 +; GFX10-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -642,14 +634,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX9-LABEL: test_v4f16_sub_mul_rhs: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2 +; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX9-NEXT: v_sub_f16_e32 v0, v4, v2 +; GFX9-NEXT: v_sub_f16_e32 v1, v5, v3 +; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -662,27 +652,23 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX9-DENORM-LABEL: test_v4f16_sub_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: 
v_pk_mul_f16 v0, v0, v2 -; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v4, v2 +; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v5, v3 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-NEXT: v_pk_mul_f16 v2, v0, v2 +; GFX10-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX10-NEXT: v_sub_f16_e32 v0, v4, v2 +; GFX10-NEXT: v_sub_f16_e32 v1, v5, v3 +; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: 
test_v4f16_sub_mul_rhs: @@ -695,14 +681,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX10-DENORM-LABEL: test_v4f16_sub_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v4, v2 +; GFX10-DENORM-NEXT: v_sub_f16_e32 v1, v5, v3 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll index c0a828ecacbae..6143e91f037df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -219,14 +219,12 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX9-LABEL: test_v4f16_sub_ext_neg_mul: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: 
v_sub_f16_e32 v2, v0, v4 -; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_sub_f16_e32 v0, v2, v4 +; GFX9-NEXT: v_sub_f16_e32 v1, v3, v5 +; GFX9-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -239,27 +237,23 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX9-DENORM-LABEL: test_v4f16_sub_ext_neg_mul: ; GFX9-DENORM: ; %bb.0: ; %entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0 -; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4 +; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 
dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_ext_neg_mul: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/176383 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
