https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/179224
>From bacdcb5d815f7117361455068c9087fc45d8c94c Mon Sep 17 00:00:00 2001 From: Petar Avramovic <[email protected]> Date: Mon, 23 Mar 2026 13:02:34 +0100 Subject: [PATCH] AMDGPU: Fix src2_modifiers for v_dot2_f32_f16/bf16 --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 9 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 13 ++ .../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 + llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 28 ++- llvm/lib/Target/AMDGPU/VOPInstructions.td | 6 +- .../AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll | 163 ++++++++++++------ .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 15 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 35 ++-- 11 files changed, 192 insertions(+), 86 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 84c0348c1d611..de8722841d3fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,6 +51,10 @@ def gi_vop3pmodsdot : GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">, GIComplexPatternEquiv<VOP3PModsDOT>; +def gi_vop3pmodsf32 : + GIComplexOperandMatcher<s32, "selectVOP3PModsF32">, + GIComplexPatternEquiv<VOP3PModsF32>; + def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 749450aaf0344..613dcfeb646a2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3691,6 +3691,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } +bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + SelectVOP3Mods(In, Src, SrcMods); + unsigned Mods = SISrcMods::OP_SEL_1; + Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue(); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index ffeb6dfdb3f90..8b12d1d2a800f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -233,6 +233,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, bool IsDOT = false) const; bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index eb0b05a45d47d..80b30b98ab590 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5269,6 +5269,19 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + Mods |= SISrcMods::OP_SEL_1; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index cc121632e101d..2c9ecc207d8bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -200,6 +200,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectVOP3PModsDOT(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsF32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index d9b40beaf7318..229cac30d4165 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1706,6 +1706,8 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; +def VOP3PModsF32 : ComplexPattern<untyped, 2, "SelectVOP3PModsF32">; + def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 992c375069e77..9bde8634e2ee2 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -86,6 +86,21 @@ multiclass VOP3PInst<string OpName, VOPProfile P, } // end SubtargetPredicate = isGFX11Plus } +multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P, + SDPatternOperator node = null_frag> { + def NAME : VOP3P_Pseudo<OpName, P, + getVOP3PModPat<P, node, + 1 /*HasExplicitClamp*/, 1/*IsDOT*/, + VOP3PModsDOT, VOP3PModsF32>.ret>; + let SubtargetPredicate = isGFX11Plus in { + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo<OpName, P> { + let VOP3P = 1; + let PseudoInstr = OpName #"_dpp"; + } + } // end SubtargetPredicate = isGFX11Plus +} + // Non-packed instructions that use the VOP3P encoding. // VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> { @@ -598,9 +613,11 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", } // End OtherPredicates = [HasDot2Insts] let OtherPredicates = [HasDot10Insts] in -defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", - VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>, - AMDGPUfdot2, 1/*ExplicitClamp*/>; +defm V_DOT2_F32_F16 : + VOP3PInstDotWithDual<"v_dot2_f32_f16", + VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, + /*HasDPP*/ 1>, + AMDGPUfdot2>; let OtherPredicates = [HasDot7Insts] in { defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", @@ -623,8 +640,9 @@ def DOT2_BF16_Profile let SubtargetPredicate = HasDot12Insts in { -defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile, - int_amdgcn_fdot2_f32_bf16, 1>; +defm V_DOT2_F32_BF16 : + VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile, + int_amdgcn_fdot2_f32_bf16>; } // End SubtargetPredicate = HasDot12Insts diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 9d56aa4ad5cb0..82545a472cf17 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1380,10 +1380,12 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp, bit IsDOT = 0, - ComplexPattern SrcPat = !if(IsDOT, VOP3PModsDOT, VOP3PMods)> { + ComplexPattern SrcPat = !if(IsDOT, VOP3PModsDOT, + VOP3PMods), + ComplexPattern Src2Pat = SrcPat> { dag src0_dag = (P.Src0VT (SrcPat P.Src0VT:$src0, i32:$src0_modifiers)); dag src1_dag = (P.Src1VT (SrcPat P.Src1VT:$src1, i32:$src1_modifiers)); - dag src2_dag = (P.Src2VT (SrcPat P.Src2VT:$src2, i32:$src2_modifiers)); + dag src2_dag = (P.Src2VT (Src2Pat P.Src2VT:$src2, i32:$src2_modifiers)); dag clamp_dag = (i1 timm:$clamp); list<dag> ret3 = [(set P.DstVT:$vdst, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll index cd8ce7a408370..0d93cfe52af54 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll @@ -1,41 +1,51 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10PLUS %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10PLUS %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX10PLUS %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "s_wait" --filter-out "s_nop" --filter-out "s_delay_alu" --filter-out "s_setpc_b64" +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GCN,GFX906 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GCN,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GCN,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GCN,GFX11 %s define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 +; +; GFX10-LABEL: v_fdot2: +; GFX10: ; %bb.0: +; GFX10: v_dot2c_f32_f16 v2, v0, v1 +; GFX10: v_mov_b32_e32 v0, v2 +; +; GFX11-LABEL: v_fdot2: +; GFX11: ; %bb.0: +; GFX11: v_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_mov_b32_e32 v0, v2 %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) ret float %r } define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) { -; GFX906-LABEL: v_fdot2_clamp: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp -; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX10PLUS-LABEL: v_fdot2_clamp: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_fdot2_clamp: +; GCN: ; %bb.0: +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true) ret float %r } define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_neg_a: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; +; GFX10-LABEL: v_fdot2_neg_a: +; GFX10: ; %bb.0: +; GFX10: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10: v_dot2c_f32_f16 v2, v0, v1 +; GFX10: v_mov_b32_e32 v0, v2 +; +; GFX11-LABEL: v_fdot2_neg_a: +; GFX11: ; %bb.0: +; GFX11: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11: v_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_mov_b32_e32 v0, v2 %neg.a = fneg <2 x half> %a %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) ret float %r @@ -43,10 +53,20 @@ define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_neg_b: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; +; GFX10-LABEL: v_fdot2_neg_b: +; GFX10: ; %bb.0: +; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10: v_dot2c_f32_f16 v2, v0, v1 +; GFX10: v_mov_b32_e32 v0, v2 +; +; GFX11-LABEL: v_fdot2_neg_b: +; GFX11: ; %bb.0: +; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11: v_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_mov_b32_e32 v0, v2 %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) ret float %r @@ -54,10 +74,20 @@ define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_neg_a_neg_b: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] +; +; GFX10-LABEL: v_fdot2_neg_a_neg_b: +; GFX10: ; %bb.0: +; GFX10: v_mov_b32_e32 v0, v2 +; GFX10: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10: v_dot2c_f32_f16 v0, v1, v1 +; +; GFX11-LABEL: v_fdot2_neg_a_neg_b: +; GFX11: ; %bb.0: +; GFX11: v_mov_b32_e32 v0, v2 +; GFX11: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11: v_dot2acc_f32_f16 v0, v1, v1 %neg.a = fneg <2 x half> %b %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false) @@ -66,11 +96,20 @@ define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_neg_c: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] +; +; GFX10-LABEL: v_fdot2_neg_c: +; GFX10: ; %bb.0: +; GFX10: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX10: v_dot2c_f32_f16 v2, v0, v1 +; GFX10: v_mov_b32_e32 v0, v2 +; +; GFX11-LABEL: v_fdot2_neg_c: +; GFX11: ; %bb.0: +; GFX11: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX11: v_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_mov_b32_e32 v0, v2 %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) ret float %r @@ -78,30 +117,56 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_inline_literal_a: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] +; +; GFX10-LABEL: v_fdot2_inline_literal_a: +; GFX10: ; %bb.0: +; GFX10: v_dot2c_f32_f16 v1, 0x40004000, v0 +; GFX10: v_mov_b32_e32 v0, v1 +; +; GFX11-LABEL: v_fdot2_inline_literal_a: +; GFX11: ; %bb.0: +; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0 +; GFX11: v_mov_b32_e32 v0, v1 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false) ret float %ret } define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) { ; GFX906-LABEL: v_fdot2_inline_literal_b: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] +; +; GFX10-LABEL: v_fdot2_inline_literal_b: +; GFX10: ; %bb.0: +; GFX10: v_dot2c_f32_f16 v1, 0x40004000, v0 +; GFX10: v_mov_b32_e32 v0, v1 +; +; GFX11-LABEL: v_fdot2_inline_literal_b: +; GFX11: ; %bb.0: +; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0 +; GFX11: v_mov_b32_e32 v0, v1 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false) ret float %ret } define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) { ; GFX906-LABEL: v_fdot2_inline_literal_c: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; GFX906: ; %bb.0: +; GFX906: v_dot2_f32_f16 v0, v0, v1, 1.0 +; +; GFX10-LABEL: v_fdot2_inline_literal_c: +; GFX10: ; %bb.0: +; GFX10: v_mov_b32_e32 v2, 1.0 +; GFX10: v_dot2c_f32_f16 v2, v0, v1 +; GFX10: v_mov_b32_e32 v0, v2 +; +; GFX11-LABEL: v_fdot2_inline_literal_c: +; GFX11: ; %bb.0: +; GFX11: v_mov_b32_e32 v2, 1.0 +; GFX11: v_dot2acc_f32_f16 v2, v0, v1 +; GFX11: v_mov_b32_e32 v0, v2 %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false) ret float %ret } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index ce5de94117210..6cfa02501adc5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -165,7 +165,7 @@ define float @v_fdot2_f32_bf16_neg_c(<2 x bfloat> %a, <2 x bfloat> %b, float %c) ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false) ret float %r @@ -180,8 +180,7 @@ define float @v_fdot2_f32_bf16_abs_c(<2 x bfloat> %a, <2 x bfloat> %b, float %c) ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] %abs.c = call float @llvm.fabs.f32(float %c) %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false) ret float %r @@ -344,7 +343,7 @@ define float @v_fdot2_f32_bf16_neg_b_clamp(<2 x bfloat> %a, <2 x bfloat> %b, flo define float @v_fdot2_f32_bf16_neg_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { ; GCN-LABEL: v_fdot2_f32_bf16_neg_c_clamp: ; GCN: ; %bb.0: -; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] clamp +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] clamp %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 true) ret float %r @@ -353,8 +352,7 @@ define float @v_fdot2_f32_bf16_neg_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b, flo define float @v_fdot2_f32_bf16_abs_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) { ; GCN-LABEL: v_fdot2_f32_bf16_abs_c_clamp: ; GCN: ; %bb.0: -; GCN: v_and_b32_e32 v2, 0x7fffffff, v2 -; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp +; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] clamp %abs.c = call float @llvm.fabs.f32(float %c) %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 true) ret float %r @@ -682,7 +680,7 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 ; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %neg.c = fneg float %c @@ -702,8 +700,7 @@ define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa ; ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual: ; GFX11PLUS: ; %bb.0: -; GFX11PLUS: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 +; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5 ; GFX11PLUS: v_add_f32_e32 v0, v0, v1 %abs.c = call float @llvm.fabs.f32(float %c) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index 3312f29470066..c0f1240e4ef05 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -290,7 +290,7 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_neg_c: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; ; GFX950-LABEL: v_fdot2_neg_c: ; GFX950: ; %bb.0: @@ -312,11 +312,11 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX1170-LABEL: v_fdot2_neg_c: ; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; ; GFX12-LABEL: v_fdot2_neg_c: ; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) ret float %r @@ -325,8 +325,7 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2_abs_c: ; GFX906: ; %bb.0: -; GFX906: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; ; GFX950-LABEL: v_fdot2_abs_c: ; GFX950: ; %bb.0: @@ -348,13 +347,11 @@ define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) { ; ; GFX1170-LABEL: v_fdot2_abs_c: ; GFX1170: ; %bb.0: -; GFX1170: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; ; GFX12-LABEL: v_fdot2_abs_c: ; GFX12: ; %bb.0: -; GFX12: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] %abs.c = call float @llvm.fabs.f32(float %c) %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false) ret float %r @@ -637,7 +634,7 @@ define float @v_fdot2_neg_b_clamp(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_neg_c_clamp(<2 x half> %a, <2 x half> %b, float %c) { ; GCN-LABEL: v_fdot2_neg_c_clamp: ; GCN: ; %bb.0: -; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] clamp +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] clamp %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 true) ret float %r @@ -646,8 +643,7 @@ define float @v_fdot2_neg_c_clamp(<2 x half> %a, <2 x half> %b, float %c) { define float @v_fdot2_abs_c_clamp(<2 x half> %a, <2 x half> %b, float %c) { ; GCN-LABEL: v_fdot2_abs_c_clamp: ; GCN: ; %bb.0: -; GCN: v_and_b32_e32 v2, 0x7fffffff, v2 -; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp +; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] clamp %abs.c = call float @llvm.fabs.f32(float %c) %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 true) ret float %r @@ -1191,7 +1187,7 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_neg_c_dual: ; GFX906: ; %bb.0: -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 ; GFX906: v_add_f32_e32 v0, v0, v1 ; @@ -1217,13 +1213,13 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX1170-LABEL: v_fdot2_neg_c_dual: ; GFX1170: ; %bb.0: -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 ; GFX1170: v_add_f32_e32 v0, v0, v1 ; ; GFX12-LABEL: v_fdot2_neg_c_dual: ; GFX12: ; %bb.0: -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] ; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 ; GFX12: v_add_f32_e32 v0, v0, v1 %neg.c = fneg float %c @@ -1236,8 +1232,7 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) { ; GFX906-LABEL: v_fdot2_abs_c_dual: ; GFX906: ; %bb.0: -; GFX906: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX906: v_dot2_f32_f16 v1, v3, v4, v5 ; GFX906: v_add_f32_e32 v0, v0, v1 ; @@ -1263,15 +1258,13 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha ; ; GFX1170-LABEL: v_fdot2_abs_c_dual: ; GFX1170: ; %bb.0: -; GFX1170: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5 ; GFX1170: v_add_f32_e32 v0, v0, v1 ; ; GFX12-LABEL: v_fdot2_abs_c_dual: ; GFX12: ; %bb.0: -; GFX12: v_and_b32_e32 v2, 0x7fffffff, v2 -; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] ; GFX12: v_dot2_f32_f16 v1, v3, v4, v5 ; GFX12: v_add_f32_e32 v0, v0, v1 %abs.c = call float @llvm.fabs.f32(float %c) _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
