https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/179225
>From 14cb3bad7b0f08f19156a9a9f0388280ce88d405 Mon Sep 17 00:00:00 2001 From: Petar Avramovic <[email protected]> Date: Mon, 2 Feb 2026 13:05:03 +0100 Subject: [PATCH] AMDGPU: Improve codegen for VOP2 v_dot2c_f32_f16/bf16 Select VOP2 version when there are no src_modifiers, otherwise VOP3. --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 8 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 22 ++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 56 ++++- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 5 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 + llvm/lib/Target/AMDGPU/VOP2Instructions.td | 10 +- .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 34 +-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 208 +++++------------- 9 files changed, 157 insertions(+), 190 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 7a854d7acf84a..fcfd07cc1d0e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,10 +51,18 @@ def gi_vop3pmodsdot : GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">, GIComplexPatternEquiv<VOP3PModsDOT>; +def gi_vop3pnomodsdot : + GIComplexOperandMatcher<s32, "selectVOP3PNoModsDOT">, + GIComplexPatternEquiv<VOP3PNoModsDOT>; + def gi_vop3pmodsf32 : GIComplexOperandMatcher<s32, "selectVOP3PModsF32">, GIComplexPatternEquiv<VOP3PModsF32>; +def gi_vop3pnomodsf32 : + GIComplexOperandMatcher<s32, "selectVOP3PNoModsF32">, + GIComplexPatternEquiv<VOP3PNoModsF32>; + def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 4fdf222abb017..b9694273476f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3658,6 +3658,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue
&Src, return SelectVOP3PMods(In, Src, SrcMods, true); } +bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const { + SDValue SrcTmp, SrcModsTmp; + SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true); + if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) { + Src = SrcTmp; + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = SISrcMods::OP_SEL_1; @@ -3674,6 +3685,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const { + SDValue SrcTmp, SrcModsTmp; + SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp); + if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) { + Src = SrcTmp; + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 43550c7ab53f8..5c13072005a3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -233,7 +233,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, bool IsDOT = false) const; bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const; bool SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 76915549ebdfa..d80f8cd37a104 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4458,6 +4458,21 @@ std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( return std::pair(Src, Mods); } +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const { + unsigned Mods = SISrcMods::OP_SEL_1; + if (Subtarget->isGFX11Plus()) { + unsigned ModsImpl; + std::tie(Src, ModsImpl) = selectVOP3ModsImpl(Src); + Mods |= ModsImpl; + if (Mods & SISrcMods::ABS) { + Mods ^= SISrcMods::ABS; + Mods |= SISrcMods::NEG_HI; + } + } + return std::pair(Src, Mods); +} + Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded( Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, bool ForceVGPR) const { @@ -5164,26 +5179,43 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/); + if (Mods != SISrcMods::OP_SEL_1) + return {}; + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const { - Register Src = Root.getReg(); - unsigned Mods = SISrcMods::OP_SEL_1; - if (Subtarget->isGFX11Plus()) { - unsigned ModsImpl; - std::tie(Src, ModsImpl) = selectVOP3ModsImpl(Root.getReg()); - Mods |= ModsImpl; - if (Mods & SISrcMods::ABS) { - Mods ^= SISrcMods::ABS; - Mods |= SISrcMods::NEG_HI; - } - } + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + Register Reg; + unsigned Mods; + std::tie(Reg, Mods) = selectVOP3PModsF32Impl(Root.getReg()); + Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII); return {{ - [=](MachineInstrBuilder 
&MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg()); + if (Mods != SISrcMods::OP_SEL_1) + return {}; + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index a67c5314eb6a5..6c71715975a11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -162,6 +162,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool IsCanonicalizing = true, bool AllowAbs = true, bool OpSel = false) const; + std::pair<Register, unsigned> selectVOP3PModsF32Impl(Register Src) const; Register copyToVGPRIfSrcFolded(Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, @@ -200,7 +201,11 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectVOP3PModsDOT(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3PNoModsDOT(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3PModsF32(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PNoModsF32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index e9c5c7e36285b..1a86d90ec8a6c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1692,7 +1692,9 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; +def VOP3PNoModsDOT : ComplexPattern<untyped, 1, "SelectVOP3PNoModsDOT">; def VOP3PModsF32 : ComplexPattern<untyped, 2, "SelectVOP3PModsF32">; +def VOP3PNoModsF32 : ComplexPattern<untyped, 1, "SelectVOP3PNoModsF32">; def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 2ccf39224a278..300fb5831886f 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1302,13 +1302,19 @@ let Constraints = "$vdst = $src2", let AddedComplexity = 30 in { def : GCNPat< - (f32 (AMDGPUfdot2 v2f16:$src0, v2f16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))), + (f32 (AMDGPUfdot2 (v2f16 (VOP3PNoModsDOT v2f16:$src0)), + (v2f16 (VOP3PNoModsDOT v2f16:$src1)), + (f32 (VOP3PNoModsF32 f32:$src2)), + (i1 DSTCLAMP.NONE))), (f32 (V_DOT2C_F32_F16_e32 $src0, $src1, $src2)) > { let SubtargetPredicate = HasDot5Insts; } def : GCNPat< - (f32 (int_amdgcn_fdot2_f32_bf16 v2bf16:$src0, v2bf16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))), + (f32 (int_amdgcn_fdot2_f32_bf16 (v2bf16 (VOP3PNoModsDOT v2bf16:$src0)), + (v2bf16 (VOP3PNoModsDOT v2bf16:$src1)), + (f32 (VOP3PNoModsF32 f32:$src2)), + (i1 DSTCLAMP.NONE))), (f32 (V_DOT2C_F32_BF16_e32 $src0, $src1, $src2)) > { let SubtargetPredicate = HasDot13Insts; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 600000144887d..a16971843c247 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -19,15 +19,9 @@ define float @v_fdot2_f32_bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { } 
define float @v_fdot2_f32_bf16_neg_a(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { -; GFX950-LABEL: v_fdot2_f32_bf16_neg_a: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-LABEL: v_fdot2_f32_bf16_neg_a: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] %neg.a = fneg <2 x bfloat> %a %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false) ret float %r @@ -88,15 +82,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi(<2 x bfloat> %a, <2 x bfloat> %b, float } define float @v_fdot2_f32_bf16_neg_b(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { -; GFX950-LABEL: v_fdot2_f32_bf16_neg_b: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b: -; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-LABEL: v_fdot2_f32_bf16_neg_b: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] %neg.b = fneg <2 x bfloat> %b %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false) ret float %r @@ -507,10 +495,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: 
v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual: ; GFX11PLUS: ; %bb.0: @@ -595,10 +582,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) { ; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual: ; GFX11PLUS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index 991eac36f9d80..df489419d7057 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -35,31 +35,9 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) { } define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_a: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; -; GFX950-LABEL: v_fdot2_neg_a: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX10-LABEL: v_fdot2_neg_a: -; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_a: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX12-LABEL: v_fdot2_neg_a: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GCN-LABEL: v_fdot2_neg_a: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] 
neg_hi:[1,0,0] %neg.a = fneg <2 x half> %a %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) ret float %r @@ -80,10 +58,7 @@ define float @v_fdot2_neg_a_lo(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_neg_a_lo: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v3, 0x8000, v0 -; GFX10: v_bfi_b32 v0, 0xffff, v3, v0 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] ; ; GFX11-LABEL: v_fdot2_neg_a_lo: ; GFX11: ; %bb.0: @@ -119,11 +94,7 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_neg_a_hi: ; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v3, 0x8000 -; GFX10: v_xor_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10: v_perm_b32 v0, v3, v0, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0] ; ; GFX11-LABEL: v_fdot2_neg_a_hi: ; GFX11: ; %bb.0: @@ -145,31 +116,9 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) { } define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_neg_b: -; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; -; GFX950-LABEL: v_fdot2_neg_b: -; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 -; GFX950: v_mov_b32_e32 v0, v2 -; -; GFX10-LABEL: v_fdot2_neg_b: -; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 -; -; GFX11-LABEL: v_fdot2_neg_b: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX12-LABEL: v_fdot2_neg_b: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GCN-LABEL: v_fdot2_neg_b: +; GCN: 
; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) ret float %r @@ -190,10 +139,7 @@ define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_neg_b_lo: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v3, 0x8000, v1 -; GFX10: v_bfi_b32 v1, 0xffff, v3, v1 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] ; ; GFX11-LABEL: v_fdot2_neg_b_lo: ; GFX11: ; %bb.0: @@ -229,11 +175,7 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_neg_b_hi: ; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v3, 0x8000 -; GFX10: v_xor_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0] ; ; GFX11-LABEL: v_fdot2_neg_b_hi: ; GFX11: ; %bb.0: @@ -272,15 +214,9 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX10: v_dot2c_f32_f16 v2, v0, v1 ; GFX10: v_mov_b32_e32 v0, v2 ; -; GFX11-LABEL: v_fdot2_neg_c: -; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX12-LABEL: v_fdot2_neg_c: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX11PLUS-LABEL: v_fdot2_neg_c: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) ret float %r @@ -304,15 +240,9 @@ define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX10: v_dot2c_f32_f16 v2, v0, v1 ; GFX10: v_mov_b32_e32 v0, v2 ; -; GFX11-LABEL: v_fdot2_abs_c: -; GFX11: ; %bb.0: -; 
GFX11: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_mov_b32_e32 v0, v2 -; -; GFX12-LABEL: v_fdot2_abs_c: -; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX11PLUS-LABEL: v_fdot2_abs_c: +; GFX11PLUS: ; %bb.0: +; GFX11PLUS: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] %abs.c = call float @llvm.fabs.f32(float %c) %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false) ret float %r @@ -332,9 +262,7 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_lo_a: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] ; ; GFX11-LABEL: v_fdot2_opsel_lo_a: ; GFX11: ; %bb.0: @@ -365,9 +293,7 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_hi_a: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] ; ; GFX11-LABEL: v_fdot2_opsel_hi_a: ; GFX11: ; %bb.0: @@ -398,9 +324,7 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_lo_b: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x7060302 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] ; ; GFX11-LABEL: v_fdot2_opsel_lo_b: ; GFX11: ; %bb.0: @@ -431,9 +355,7 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX10-LABEL: v_fdot2_opsel_hi_b: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] ; ; GFX11-LABEL: v_fdot2_opsel_hi_b: ; GFX11: ; %bb.0: @@ -818,23 +740,21 @@ 
define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX950-LABEL: v_fdot2_neg_a_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_a_dual: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX12-LABEL: v_fdot2_neg_a_dual: ; GFX12: ; %bb.0: @@ -866,11 +786,9 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_a_lo_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v6, 0x8000, v0 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_bfi_b32 v0, 0xffff, v6, v0 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_a_lo_dual: ; GFX11: ; %bb.0: @@ -913,12 +831,9 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_a_hi_dual: ; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v6, 0x8000 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_xor_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 -; GFX10: v_perm_b32 v0, v6, v0, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_a_hi_dual: ; GFX11: ; %bb.0: @@ -952,23 +867,21 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX950-LABEL: v_fdot2_neg_b_dual: ; GFX950: ; %bb.0: -; GFX950: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1 +; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4 -; GFX950: v_add_f32_e32 v0, v2, v5 +; GFX950: v_add_f32_e32 v0, v0, v5 ; ; GFX10-LABEL: v_fdot2_neg_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_dual: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX12-LABEL: v_fdot2_neg_b_dual: ; GFX12: ; %bb.0: @@ -1000,11 +913,9 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: v_fdot2_neg_b_lo_dual: ; GFX10: ; %bb.0: -; GFX10: v_xor_b32_e32 v6, 0x8000, v1 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_bfi_b32 v1, 0xffff, v6, v1 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_lo_dual: ; GFX11: ; %bb.0: @@ -1047,12 +958,9 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ; ; GFX10-LABEL: 
v_fdot2_neg_b_hi_dual: ; GFX10: ; %bb.0: -; GFX10: v_mov_b32_e32 v6, 0x8000 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_xor_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10: v_perm_b32 v1, v6, v1, 0x5040100 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_neg_b_hi_dual: ; GFX11: ; %bb.0: @@ -1101,9 +1009,9 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX11-LABEL: v_fdot2_neg_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX12-LABEL: v_fdot2_neg_c_dual: ; GFX12: ; %bb.0: @@ -1141,9 +1049,9 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX11-LABEL: v_fdot2_abs_c_dual: ; GFX11: ; %bb.0: -; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_and_b32 v2, 0x7fffffff, v2 -; GFX11: v_dot2acc_f32_f16 v2, v0, v1 -; GFX11: v_add_f32_e32 v0, v2, v5 +; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] +; GFX11: v_dot2acc_f32_f16 v5, v3, v4 +; GFX11: v_add_f32_e32 v0, v0, v5 ; ; GFX12-LABEL: v_fdot2_abs_c_dual: ; GFX12: ; %bb.0: @@ -1174,10 +1082,9 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_lo_a_dual: ; GFX11: ; %bb.0: @@ -1215,10 +1122,9 @@ define float 
@v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v0, v0, v0, 0x5040100 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_hi_a_dual: ; GFX11: ; %bb.0: @@ -1256,10 +1162,9 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x7060302 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_lo_b_dual: ; GFX11: ; %bb.0: @@ -1297,10 +1202,9 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 ; ; GFX10-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX10: ; %bb.0: -; GFX10: v_perm_b32 v1, v1, v1, 0x5040100 +; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] ; GFX10: v_dot2c_f32_f16 v5, v3, v4 -; GFX10: v_dot2c_f32_f16 v2, v0, v1 -; GFX10: v_add_f32_e32 v0, v2, v5 +; GFX10: v_add_f32_e32 v0, v0, v5 ; ; GFX11-LABEL: v_fdot2_opsel_hi_b_dual: ; GFX11: ; %bb.0: _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
