https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/179225
Select VOP2 version when there are no src_modifiers, otherwise VOP3. >From 8228474f52a1de46ccf4aa65eb3783dd6b14785c Mon Sep 17 00:00:00 2001 From: Petar Avramovic <[email protected]> Date: Mon, 2 Feb 2026 13:05:03 +0100 Subject: [PATCH] AMDGPU: Improve codegen for VOP2 v_dot2c_f32_f16/bf16 Select VOP2 version when there are no src_modifiers, otherwise VOP3. --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 8 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 22 +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 56 ++++-- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 5 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 + llvm/lib/Target/AMDGPU/VOP2Instructions.td | 10 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 34 ++-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 160 +++++------------- 9 files changed, 145 insertions(+), 154 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 7a854d7acf84a..fcfd07cc1d0e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,10 +51,18 @@ def gi_vop3pmodsdot : GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">, GIComplexPatternEquiv<VOP3PModsDOT>; +def gi_vop3pnomodsdot : + GIComplexOperandMatcher<s32, "selectVOP3PNoModsDOT">, + GIComplexPatternEquiv<VOP3PNoModsDOT>; + def gi_vop3pmodsf32 : GIComplexOperandMatcher<s32, "selectVOP3PModsF32">, GIComplexPatternEquiv<VOP3PModsF32>; +def gi_vop3pnomodsf32 : + GIComplexOperandMatcher<s32, "selectVOP3PNoModsF32">, + GIComplexPatternEquiv<VOP3PNoModsF32>; + def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 4fdf222abb017..b9694273476f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3658,6 +3658,17 
@@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } +bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const { + SDValue SrcTmp, SrcModsTmp; + SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true); + if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) { + Src = SrcTmp; + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = SISrcMods::OP_SEL_1; @@ -3674,6 +3685,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const { + SDValue SrcTmp, SrcModsTmp; + SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp); + if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) { + Src = SrcTmp; + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 43550c7ab53f8..5c13072005a3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -233,7 +233,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, bool IsDOT = false) const; bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const; bool SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 76915549ebdfa..d80f8cd37a104 
100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4458,6 +4458,21 @@ std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( return std::pair(Src, Mods); } +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const { + unsigned Mods = SISrcMods::OP_SEL_1; + if (Subtarget->isGFX11Plus()) { + unsigned ModsImpl; + std::tie(Src, ModsImpl) = selectVOP3ModsImpl(Src); + Mods |= ModsImpl; + if (Mods & SISrcMods::ABS) { + Mods ^= SISrcMods::ABS; + Mods |= SISrcMods::NEG_HI; + } + } + return std::pair(Src, Mods); +} + Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded( Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, bool ForceVGPR) const { @@ -5164,26 +5179,43 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/); + if (Mods != SISrcMods::OP_SEL_1) + return {}; + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const { - Register Src = Root.getReg(); - unsigned Mods = SISrcMods::OP_SEL_1; - if (Subtarget->isGFX11Plus()) { - unsigned ModsImpl; - std::tie(Src, ModsImpl) = selectVOP3ModsImpl(Root.getReg()); - Mods |= ModsImpl; - if (Mods & SISrcMods::ABS) { - Mods ^= SISrcMods::ABS; - Mods |= SISrcMods::NEG_HI; - } - } + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + Register Reg; + unsigned Mods; + std::tie(Reg, Mods) = selectVOP3PModsF32Impl(Root.getReg()); + Reg = getLegalRegBank(Reg, 
Root.getReg(), RBI, MRI, TRI, TII); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg()); + if (Mods != SISrcMods::OP_SEL_1) + return {}; + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index a67c5314eb6a5..6c71715975a11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -162,6 +162,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool IsCanonicalizing = true, bool AllowAbs = true, bool OpSel = false) const; + std::pair<Register, unsigned> selectVOP3PModsF32Impl(Register Src) const; Register copyToVGPRIfSrcFolded(Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, @@ -200,7 +201,11 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectVOP3PModsDOT(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3PNoModsDOT(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3PModsF32(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PNoModsF32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index e9c5c7e36285b..1a86d90ec8a6c 100644 --- 
a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1692,7 +1692,9 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; +def VOP3PNoModsDOT : ComplexPattern<untyped, 1, "SelectVOP3PNoModsDOT">; def VOP3PModsF32 : ComplexPattern<untyped, 2, "SelectVOP3PModsF32">; +def VOP3PNoModsF32 : ComplexPattern<untyped, 1, "SelectVOP3PNoModsF32">; def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 2ccf39224a278..300fb5831886f 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1302,13 +1302,19 @@ let Constraints = "$vdst = $src2", let AddedComplexity = 30 in { def : GCNPat< - (f32 (AMDGPUfdot2 v2f16:$src0, v2f16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))), + (f32 (AMDGPUfdot2 (v2f16 (VOP3PNoModsDOT v2f16:$src0)), + (v2f16 (VOP3PNoModsDOT v2f16:$src1)), + (f32 (VOP3PNoModsF32 f32:$src2)), + (i1 DSTCLAMP.NONE))), (f32 (V_DOT2C_F32_F16_e32 $src0, $src1, $src2)) > { let SubtargetPredicate = HasDot5Insts; } def : GCNPat< - (f32 (int_amdgcn_fdot2_f32_bf16 v2bf16:$src0, v2bf16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))), + (f32 (int_amdgcn_fdot2_f32_bf16 (v2bf16 (VOP3PNoModsDOT v2bf16:$src0)), + (v2bf16 (VOP3PNoModsDOT v2bf16:$src1)), + (f32 (VOP3PNoModsF32 f32:$src2)), + (i1 DSTCLAMP.NONE))), (f32 (V_DOT2C_F32_BF16_e32 $src0, $src1, $src2)) > { let SubtargetPredicate = HasDot13Insts; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 2c1aa25d79d71..72d5d102cefea 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -19,30 +19,18 @@ define float @v_fdot2_f32_bf16(<2 x 
bfloat> %a, <2 x bfloat> %b, float %c) { } define float @v_fdot2_f32_bf16_neg_a(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { -; GFX950-LABEL: v_fdot2_f32_bf16_neg_a: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-LABEL: v_fdot2_f32_bf16_neg_a: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] %neg.a = fneg <2 x bfloat> %a %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false) ret float %r } define float @v_fdot2_f32_bf16_neg_b(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { -; GFX950-LABEL: v_fdot2_f32_bf16_neg_b: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-LABEL: v_fdot2_f32_bf16_neg_b: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] %neg.b = fneg <2 x bfloat> %b %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false) ret float %r @@ -399,10 +387,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: 
v_fdot2_f32_bf16_neg_a_dual: ; GFX11PLUS: ; %bb.0: @@ -419,10 +406,9 @@ define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX11PLUS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index 6766270a0e2af..de0722681df28 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -35,62 +35,18 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) { } define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_a: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; -; GFX950-LABEL: v_fdot2_neg_a: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX10-LABEL: v_fdot2_neg_a: -; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_a: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX12-LABEL: v_fdot2_neg_a: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-LABEL: v_fdot2_neg_a: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] %neg.a = fneg <2 x half> %a %r = 
call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) ret float %r } define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_b: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; -; GFX950-LABEL: v_fdot2_neg_b: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX10-LABEL: v_fdot2_neg_b: -; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_b: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX12-LABEL: v_fdot2_neg_b: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-LABEL: v_fdot2_neg_b: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) ret float %r @@ -114,15 +70,9 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX10: v_dot2c_f32_f16 v2, v0, v1 ; GFX10: v_mov_b32_e32 v0, v2 ; -; GFX11-LABEL: v_fdot2_neg_c: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX12-LABEL: v_fdot2_neg_c: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX11PLUS-LABEL: v_fdot2_neg_c: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) ret float %r @@ -146,15 +96,9 @@ define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX10: v_dot2c_f32_f16 v2, v0, v1 ; GFX10: v_mov_b32_e32 v0, v2 ; -; 
GFX11-LABEL: v_fdot2_abs_c: -; GFX11: ; %bb.0: -; GFX11: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX12-LABEL: v_fdot2_abs_c: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX11PLUS-LABEL: v_fdot2_abs_c: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] %abs.c = call float @llvm.fabs.f32(float %c) %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false) ret float %r @@ -174,9 +118,7 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_lo_a: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] ; ; GFX11-LABEL: v_fdot2_opsel_lo_a: ; GFX11: ; %bb.0: @@ -207,9 +149,7 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_hi_a: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] ; ; GFX11-LABEL: v_fdot2_opsel_hi_a: ; GFX11: ; %bb.0: @@ -240,9 +180,7 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_lo_b: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x7060302 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] ; ; GFX11-LABEL: v_fdot2_opsel_lo_b: ; GFX11: ; %bb.0: @@ -273,9 +211,7 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_hi_b: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] ; ; GFX11-LABEL: 
v_fdot2_opsel_hi_b: ; GFX11: ; %bb.0: @@ -660,23 +596,21 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX950-LABEL: v_fdot2_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_a_dual: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX12-LABEL: v_fdot2_neg_a_dual: ; GFX12: ; %bb.0: @@ -699,23 +633,21 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX950-LABEL: v_fdot2_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_dual: ; GFX11: ; %bb.0: -; GFX11: 
v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX12-LABEL: v_fdot2_neg_b_dual: ; GFX12: ; %bb.0: @@ -753,9 +685,9 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX11-LABEL: v_fdot2_neg_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX12-LABEL: v_fdot2_neg_c_dual: ; GFX12: ; %bb.0: @@ -793,9 +725,9 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX11-LABEL: v_fdot2_abs_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_and_b32 v2, 0x7fffffff, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX12-LABEL: v_fdot2_abs_c_dual: ; GFX12: ; %bb.0: @@ -826,10 +758,9 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX11: ; %bb.0: @@ -867,10 +798,9 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 
0x5040100 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX11: ; %bb.0: @@ -908,10 +838,9 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x7060302 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX11: ; %bb.0: @@ -949,10 +878,9 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x5040100 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX11: ; %bb.0: _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
