Author: vangthao95 Date: 2025-12-22T09:05:35-08:00 New Revision: 0e91db465e06a377349f0d5c25a8318045e3aa0b
URL: https://github.com/llvm/llvm-project/commit/0e91db465e06a377349f0d5c25a8318045e3aa0b DIFF: https://github.com/llvm/llvm-project/commit/0e91db465e06a377349f0d5c25a8318045e3aa0b.diff LOG: Revert "[AMDGPU][GlobalISel] Add RegBankLegalize support for G_FMAD, G_FMA (#…" This reverts commit c471badd81a59f72820294e54c72c40922a38dcc. Added: Modified: llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir Removed: llvm/test/CodeGen/AMDGPU/GlobalISel/fmad.ll ################################################################################ diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index f21b87c8f92f0..cc31d7d5c55ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -684,12 +684,10 @@ bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); assert(MRI.getType(Dst) == V2S16); unsigned Opc = MI.getOpcode(); - unsigned NumOps = MI.getNumOperands(); auto Flags = MI.getFlags(); - auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); - - if (NumOps == 2) { + if (MI.getNumOperands() == 2) { + auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags); auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags); B.buildMergeLikeInstr(Dst, {Lo, Hi}); @@ -697,20 +695,11 @@ bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { return true; } + assert(MI.getNumOperands() == 3); + auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg()); - - if (NumOps == 3) { - auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); - auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); - B.buildMergeLikeInstr(Dst, {Lo, Hi}); - MI.eraseFromParent(); - return true; - } - - assert(NumOps == 4); - auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg()); - auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags); - auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); B.buildMergeLikeInstr(Dst, {Lo, Hi}); MI.eraseFromParent(); return true; @@ -982,7 +971,6 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::fixed_vector(2, 16); case SgprV2S32: case VgprV2S32: - case UniInVgprV2S32: return LLT::fixed_vector(2, 32); case SgprV4S32: case SgprV4S32_WF: @@ -1086,7 +1074,6 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case UniInVgprS32: case UniInVgprS64: case UniInVgprV2S16: - case UniInVgprV2S32: case UniInVgprV4S32: case UniInVgprB32: case UniInVgprB64: @@ -1222,7 +1209,6 @@ bool RegBankLegalizeHelper::applyMappingDst( case UniInVgprS32: case UniInVgprS64: case UniInVgprV2S16: - case UniInVgprV2S32: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); assert(RB == SgprRB); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index dee8488855b7a..63135feb4ea16 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -120,8 +120,6 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg); case UniV2S16: return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg); - case UniV2S32: - return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg); case UniB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg); case UniB64: @@ -162,8 +160,6 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg); case DivV2S16: return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg); - case DivV2S32: - return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg); case DivB32: return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg); case DivB64: @@ -972,30 +968,6 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat); - addRulesForGOpcs({G_FMAD}, Standard) - .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}) - .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}}) - .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}); - - addRulesForGOpcs({G_FMA}, Standard) - .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}}) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}}) - .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}}) - .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}}) - .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}}) - .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}}) - .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}}) - .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat) - .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat) - .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat) - .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat) - .Uni(V2S16, - {{SgprV2S16}, {SgprV2S16, SgprV2S16, SgprV2S16}, ScalarizeToS16}, - hasSALUFloat) - .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}}, - !hasSALUFloat); - // FNEG and FABS are either folded as source modifiers or can be selected as // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for // targets without SALU float we still select them as VGPR since there would diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll index 48b6dd95bdc0d..b2b433167fe4d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s ; fold (fadd (fma x, y, (fpext (fmul u, v))), z) -> (fma x, y, (fma (fpext u), (fpext v), z)) define amdgpu_vs float @test_f16_f32_add_fma_ext_mul(float %x, float %y, float %z, half %u, half %v) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll index 21997e2224735..4d603f7487754 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-FAST-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-FAST-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-FAST-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-FAST-DENORM %s ; fold (fadd fast (fpext (fmul fast x, y)), z) -> (fma (fpext x), (fpext y), z) ; fold (fadd fast x, (fpext (fmul fast y, z))) -> (fma (fpext y), (fpext z), x) @@ -49,26 +49,21 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x, ; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry ; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0 -; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s1, v0 -; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s2, v0 -; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, s4 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v1 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v2 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v3 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v4 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7 ; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul: @@ -99,29 +94,23 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg ; GFX9-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry ; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0 -; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s1, v0 -; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s2, v0 -; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-FAST-DENORM-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, s1 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, s4 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, s5 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v0 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v1 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v2 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v3 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v4 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v5 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7 +; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v8 ; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll index 8183a4dec10ca..6ea0a9446ff9d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11-DENORM %s ; fadd (fma a, b, (fmul c, d)), e --> fma a, b, (fma c, d, e) ; fadd e, (fma a, b, (fmul c, d)) --> fma a, b, (fma c, d, e) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll index 1e02f6308a0c5..3f6e3d81c52ad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s define float @test_f32_add_mul(float %x, float %y, float %z) { ; GFX9-LABEL: test_f32_add_mul: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll index 8879f7dc2b44c..4d6e60cbf6977 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-mul.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s ; fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z)) define amdgpu_vs float @test_f16_to_f32_sub_ext_mul(half %x, half %y, float %z) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll index df6c8dffba5ef..814a34754e883 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s ; fold (fsub (fpext (fneg (fmul, x, y))), z) -> (fneg (fma (fpext x), (fpext y), z)) define amdgpu_vs float @test_f16_to_f32_sub_ext_neg_mul(half %x, half %y, float %z) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll index d046b854fb0d8..99bdcdd1f31e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=GFX11-DENORM %s ; fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) ; fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll index c0a828ecacbae..70f961e2777af 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s ; fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) define float @test_f32_sub_ext_neg_mul(float %x, float %y, float %z) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index 067704cfb4d80..0b09cabf25a16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -1,12 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define float @v_fma_f32(float %x, float %y, float %z) { ; GFX6-LABEL: v_fma_f32: @@ -27,12 +25,6 @@ define float @v_fma_f32(float %x, float %y, float %z) { ; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f32: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -44,16 +36,6 @@ define float @v_fma_f32(float %x, float %y, float %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } @@ -80,12 +62,6 @@ define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) ; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_v2f32: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -99,17 +75,6 @@ define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) ; GFX11-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX11-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_v2f32: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX12-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) ret <2 x float> %fma } @@ -137,12 +102,6 @@ define half @v_fma_f16(half %x, half %y, half %z) { ; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f16: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -161,16 +120,6 @@ define half @v_fma_f16(half %x, half %y, half %z) { ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fma = call half @llvm.fma.f16(half %x, half %y, half %z) ret half %fma } @@ -198,12 +147,6 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) { ; GFX9-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f16_fneg_lhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f16 v0, -v0, v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f16_fneg_lhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -221,16 +164,6 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) { ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f16_fneg_lhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f16 v0, -v0, v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg half %x %fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z) ret half %fma @@ -259,12 +192,6 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) { ; GFX9-NEXT: v_fma_f16 v0, v0, -v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f16_fneg_rhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f16 v0, v0, -v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f16_fneg_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -282,16 +209,6 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) { ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f16_fneg_rhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f16 v0, v0, -v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg half %y %fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z) ret half %fma @@ -320,12 +237,6 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) { ; GFX9-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f16_fneg_add: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f16 v0, v0, v1, -v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f16_fneg_add: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -343,16 +254,6 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) { ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f16_fneg_add: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f16 v0, v0, v1, -v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg half %z %fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z) ret half %fma @@ -392,12 +293,6 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) { ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_v2f16: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -409,16 +304,6 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_v2f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) ret <2 x half> %fma } @@ -463,12 +348,6 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_v2f16_fneg_lhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_v2f16_fneg_lhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -480,16 +359,6 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_v2f16_fneg_lhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y, <2 x half> %z) ret <2 x half> %fma @@ -535,12 +404,6 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_v2f16_fneg_rhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_v2f16_fneg_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -552,16 +415,6 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_v2f16_fneg_rhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y.fneg, <2 x half> %z) ret <2 x half> %fma @@ -601,12 +454,6 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_v2f16_fneg_lhs_rhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_v2f16_fneg_lhs_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -618,16 +465,6 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_v2f16_fneg_lhs_rhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg, <2 x half> %z) @@ -675,13 +512,6 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) { ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_v3f16: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX90A-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_v3f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -695,17 +525,6 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) { ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v2, v4 ; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_v3f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX12-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) ret <3 x half> %fma } @@ -762,13 +581,6 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) { ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_v4f16: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX90A-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -782,17 +594,6 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) { ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v2, v4 ; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_v4f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX12-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) ret <4 x half> %fma } @@ -816,14 +617,6 @@ define double @v_fma_f64(double %x, double %y, double %z) { ; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f64: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], v[0:1], v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -835,16 +628,6 @@ define double @v_fma_f64(double %x, double %y, double %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX12-NEXT: s_setpc_b64 s[30:31] %fma = call double @llvm.fma.f64(double %x, double %y, double %z) ret double %fma } @@ -868,12 +651,6 @@ define double @v_fma_f64_fneg_all(double %x, double %y, double %z) { ; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f64_fneg_all: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f64_fneg_all: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -885,16 +662,6 @@ define double @v_fma_f64_fneg_all(double %x, double %y, double %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f64_fneg_all: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] -; GFX12-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg double %x %neg.y = fneg double %y %neg.z = fneg double %z @@ -924,17 +691,6 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> ; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_v2f64: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fmac_f64_e32 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_fmac_f64_e32 v[10:11], v[2:3], v[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v8 -; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: v_mov_b32_e32 v2, v10 -; GFX90A-NEXT: v_mov_b32_e32 v3, v11 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -948,17 +704,6 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] ; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_v2f64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] -; GFX12-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] -; GFX12-NEXT: s_setpc_b64 s[30:31] %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) ret <2 x double> %fma } @@ -982,12 +727,6 @@ define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) { ; GFX9-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f32_fabs_lhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f32 v0, |v0|, v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f32_fabs_lhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -999,16 +738,6 @@ define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f32_fabs_lhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f32 v0, |v0|, v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z) ret float %fma @@ -1033,12 +762,6 @@ define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) { ; GFX9-NEXT: v_fma_f32 v0, v0, |v1|, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f32_fabs_rhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f32 v0, v0, |v1|, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f32_fabs_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1050,16 +773,6 @@ define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v0, v0, |v1|, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f32_fabs_rhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f32 v0, v0, |v1|, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z) ret float %fma @@ -1084,12 +797,6 @@ define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) { ; GFX9-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f32_fabs_lhs_rhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f32_fabs_lhs_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1101,16 +808,6 @@ define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f32_fabs_lhs_rhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) %fma = call float @llvm.fma.f32(float %fabs.x, float %fabs.y, float %z) @@ -1133,11 +830,6 @@ define amdgpu_ps float @v_fma_f32_sgpr_vgpr_vgpr(float inreg %x, float %y, float ; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX90A-LABEL: v_fma_f32_sgpr_vgpr_vgpr: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX90A-NEXT: ; return to shader part epilog -; ; GFX10-LABEL: v_fma_f32_sgpr_vgpr_vgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_fma_f32 v0, s0, v0, v1 @@ -1147,11 +839,6 @@ define amdgpu_ps float @v_fma_f32_sgpr_vgpr_vgpr(float inreg %x, float %y, float ; GFX11: ; %bb.0: ; GFX11-NEXT: v_fma_f32 v0, s0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: v_fma_f32_sgpr_vgpr_vgpr: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX12-NEXT: ; return to shader part epilog %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } @@ -1172,11 +859,6 @@ define amdgpu_ps float @v_fma_f32_vgpr_sgpr_vgpr(float %x, float inreg %y, float ; GFX9-NEXT: v_fma_f32 v0, v0, s0, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX90A-LABEL: v_fma_f32_vgpr_sgpr_vgpr: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX90A-NEXT: ; return to shader part epilog -; ; GFX10-LABEL: v_fma_f32_vgpr_sgpr_vgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_fma_f32 v0, s0, v0, v1 @@ -1186,11 +868,6 @@ define amdgpu_ps float @v_fma_f32_vgpr_sgpr_vgpr(float %x, float inreg %y, float ; GFX11: ; %bb.0: ; GFX11-NEXT: v_fma_f32 v0, s0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: v_fma_f32_vgpr_sgpr_vgpr: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX12-NEXT: ; return to shader part epilog %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } @@ -1217,13 +894,6 @@ define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y, ; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX90A-LABEL: v_fma_f32_sgpr_sgpr_sgpr: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX90A-NEXT: ; return to shader part epilog -; ; GFX10-LABEL: v_fma_f32_sgpr_sgpr_sgpr: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, s2 @@ -1235,12 +905,6 @@ define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y, ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_fma_f32 v0, s1, s0, v0 ; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: v_fma_f32_sgpr_sgpr_sgpr: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_fmac_f32 s2, s0, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: ; return to shader part epilog %fma = call float @llvm.fma.f32(float %x, float %y, float %z) ret float %fma } @@ -1264,12 +928,6 @@ define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) { ; GFX9-NEXT: v_fma_f32 v0, -v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f32_fneg_lhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f32 v0, -v0, v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f32_fneg_lhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1281,16 +939,6 @@ define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v0, -v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f32_fneg_lhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f32 v0, -v0, v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z) ret float %fma @@ -1315,12 +963,6 @@ define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) { ; GFX9-NEXT: v_fma_f32 v0, v0, -v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f32_fneg_rhs: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f32 v0, v0, -v1, v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f32_fneg_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1332,16 +974,6 @@ define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v0, v0, -v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f32_fneg_rhs: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f32 v0, v0, -v1, v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y %fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z) ret float %fma @@ -1366,12 +998,6 @@ define float @v_fma_f32_fneg_z(float %x, float %y, float %z) { ; GFX9-NEXT: v_fma_f32 v0, v0, v1, -v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: v_fma_f32_fneg_z: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_fma_f32 v0, v0, v1, -v2 -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_fma_f32_fneg_z: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1383,16 +1009,6 @@ define float @v_fma_f32_fneg_z(float %x, float %y, float %z) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fma_f32 v0, v0, v1, -v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_fma_f32_fneg_z: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fma_f32 v0, v0, v1, -v2 -; GFX12-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg float %z %fma = call float @llvm.fma.f32(float %x, float %y, float %neg.z) ret float %fma @@ -1414,11 +1030,6 @@ define amdgpu_ps float @dont_crash_after_fma_mix_select_attempt(float inreg %x, ; GFX9-NEXT: v_fma_f32 v0, |s0|, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX90A-LABEL: dont_crash_after_fma_mix_select_attempt: -; GFX90A: ; %bb.0: ; %.entry -; GFX90A-NEXT: v_fma_f32 v0, |s0|, v0, v1 -; GFX90A-NEXT: ; return to shader part epilog -; ; GFX10-LABEL: dont_crash_after_fma_mix_select_attempt: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_fma_f32 v0, |s0|, v0, v1 @@ -1428,331 +1039,12 @@ define amdgpu_ps float @dont_crash_after_fma_mix_select_attempt(float inreg %x, ; GFX11: ; %bb.0: ; %.entry ; GFX11-NEXT: v_fma_f32 v0, |s0|, v0, v1 ; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: dont_crash_after_fma_mix_select_attempt: -; GFX12: ; %bb.0: ; %.entry -; GFX12-NEXT: v_fma_f32 v0, |s0|, v0, v1 -; GFX12-NEXT: ; return to shader part epilog .entry: %fabs.x = call contract float @llvm.fabs.f32(float %x) %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z) ret float %fma } -define amdgpu_ps half @fma_s16_uniform(half inreg %a, half inreg %b, half inreg %c) { -; GFX6-LABEL: fma_s16_uniform: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, s2 -; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: fma_s16_uniform: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_fma_f16 v0, s0, v0, v1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: fma_s16_uniform: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_fma_f16 v0, s0, v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX90A-LABEL: fma_s16_uniform: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NEXT: v_fma_f16 v0, s0, v0, v1 -; GFX90A-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: fma_s16_uniform: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_fma_f16 v0, s1, s0, v0 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-TRUE16-LABEL: fma_s16_uniform: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: v_fmac_f16_e64 v0.l, s0, s1 -; GFX11-TRUE16-NEXT: ; return to shader part epilog -; -; GFX11-FAKE16-LABEL: fma_s16_uniform: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-FAKE16-NEXT: v_fma_f16 v0, s1, s0, v0 -; GFX11-FAKE16-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: fma_s16_uniform: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_fmac_f16 s2, s0, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: ; return to shader part epilog - %fma = call half @llvm.fma.f16(half %a, half %b, half %c) - ret half %fma -} - -define amdgpu_ps float @fma_s32_uniform(float inreg %a, float inreg %b, float inreg %c) { -; GFX6-LABEL: fma_s32_uniform: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: fma_s32_uniform: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: fma_s32_uniform: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX90A-LABEL: fma_s32_uniform: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX90A-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: fma_s32_uniform: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_fma_f32 v0, s1, s0, v0 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: fma_s32_uniform: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_fma_f32 v0, s1, s0, v0 -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: fma_s32_uniform: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_fmac_f32 s2, s0, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-NEXT: ; return to shader part epilog - %fma = call float @llvm.fma.f32(float %a, float %b, float %c) - ret float %fma -} - -define amdgpu_ps void @fma_s64_uniform(double inreg %a, double inreg %b, double inreg %c, ptr addrspace(1) %ptr) { -; GFX6-LABEL: fma_s64_uniform: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-NEXT: v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5] -; GFX6-NEXT: s_mov_b32 s2, 0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b64 s[0:1], 0 -; GFX6-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_endpgm -; -; GFX8-LABEL: fma_s64_uniform: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5] -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: fma_s64_uniform: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5] -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: s_endpgm -; -; GFX90A-LABEL: fma_s64_uniform: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_fmac_f64_e32 v[4:5], s[0:1], v[2:3] -; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off -; GFX90A-NEXT: s_endpgm -; -; GFX10-LABEL: fma_s64_uniform: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_fma_f64 v[2:3], s[0:1], s[2:3], v[2:3] -; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: fma_s64_uniform: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX11-NEXT: v_fma_f64 v[2:3], s[0:1], s[2:3], v[2:3] -; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: fma_s64_uniform: -; GFX12: ; %bb.0: -; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: v_fma_f64 v[2:3], s[0:1], s[2:3], v[2:3] -; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX12-NEXT: s_endpgm - %fma = call double @llvm.fma.f64(double %a, double %b, double %c) - store double %fma, ptr addrspace(1) %ptr - ret void -} - -define amdgpu_ps <2 x half> @fma_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b, <2 x half> inreg %c) { -; GFX6-LABEL: fma_v2s16_uniform: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, s4 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, s1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, s3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v5, s5 -; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v3, v4, v5 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: fma_v2s16_uniform: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: v_fma_f16 v0, s0, v0, v1 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_fma_f16 v0, s3, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s1, v0 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: fma_v2s16_uniform: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_pk_fma_f16 v0, s0, v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX90A-LABEL: fma_v2s16_uniform: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NEXT: v_pk_fma_f16 v0, s0, v0, v1 -; GFX90A-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: fma_v2s16_uniform: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_pk_fma_f16 v0, s0, s1, v0 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: fma_v2s16_uniform: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_pk_fma_f16 v0, s0, s1, v0 -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: fma_v2s16_uniform: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_lshr_b32 s3, s0, 16 -; GFX12-NEXT: s_lshr_b32 s4, s1, 16 -; GFX12-NEXT: s_lshr_b32 s5, s2, 16 -; GFX12-NEXT: s_fmac_f16 s2, s0, s1 -; GFX12-NEXT: s_fmac_f16 s5, s3, s4 -; GFX12-NEXT: s_pack_ll_b32_b16 s0, s2, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-NEXT: ; return to shader part epilog - %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) - ret <2 x half> %fma -} - -define amdgpu_ps <2 x float> @fma_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b, <2 x float> inreg %c) { -; GFX6-LABEL: fma_v2s32_uniform: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NEXT: v_fma_f32 v1, s1, v1, v2 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: fma_v2s32_uniform: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_fma_f32 v1, s1, v1, v2 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: fma_v2s32_uniform: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_fma_f32 v1, s1, v1, v2 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX90A-LABEL: fma_v2s32_uniform: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_pk_fma_f32 v[0:1], s[0:1], v[0:1], v[2:3] -; GFX90A-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: fma_v2s32_uniform: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_fma_f32 v0, s2, s0, v0 -; GFX10-NEXT: v_fma_f32 v1, s3, s1, v1 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: fma_v2s32_uniform: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_fma_f32 v0, s2, s0, v0 -; GFX11-NEXT: v_fma_f32 v1, s3, s1, v1 -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: fma_v2s32_uniform: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_fmac_f32 s4, s0, s2 -; GFX12-NEXT: s_fmac_f32 s5, s1, s3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: ; return to shader part epilog - %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) - ret <2 x float> %fma -} - declare half @llvm.fma.f16(half, half, half) #0 declare float @llvm.fma.f32(float, float, float) #0 declare double @llvm.fma.f64(double, double, double) #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmad.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmad.ll deleted file mode 100644 index 4907ee16a4978..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmad.ll +++ /dev/null @@ -1,95 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji --denormal-fp-math=preserve-sign -o - %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 --denormal-fp-math=preserve-sign -o - %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 --denormal-fp-math=preserve-sign -o - %s | FileCheck -check-prefix=GFX10 %s - -define amdgpu_ps float @fmad_s32_uniform(float inreg %a, float inreg %b, float inreg %c) { -; GFX8-LABEL: fmad_s32_uniform: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_f32 v0, s0, v0, v1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: fmad_s32_uniform: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_f32 v0, s0, v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: fmad_s32_uniform: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mad_f32 v0, s1, s0, v0 -; GFX10-NEXT: ; return to shader part epilog - %mul = fmul float %a, %b - %result = fadd float %mul, %c - ret float %result -} - -define amdgpu_ps float @fmad_s32_div(float %a, float %b, float %c) { -; GFX8-LABEL: fmad_s32_div: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mad_f32 v0, v0, v1, v2 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: fmad_s32_div: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mad_f32 v0, v0, v1, v2 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: fmad_s32_div: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mad_f32 v0, v0, v1, v2 -; GFX10-NEXT: ; return to shader part epilog - %mul = fmul float %a, %b - %result = fadd float %mul, %c - ret float %result -} - -define amdgpu_ps half @fmad_s16_uniform(half inreg %a, half inreg %b, half inreg %c) { -; GFX8-LABEL: fmad_s16_uniform: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_f16 v0, s0, v0, v1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: fmad_s16_uniform: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_legacy_f16 v0, s0, v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: fmad_s16_uniform: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mul_f16_e64 v0, s0, s1 -; GFX10-NEXT: v_add_f16_e32 v0, s2, v0 -; GFX10-NEXT: ; return to shader part epilog - %mul = fmul half %a, %b - %result = fadd half %mul, %c - ret half %result -} - -define amdgpu_ps half @fmad_s16_div(half %a, half %b, half %c) { -; GFX8-LABEL: fmad_s16_div: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mad_f16 v0, v0, v1, v2 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: fmad_s16_div: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: fmad_s16_div: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX10-NEXT: ; return to shader part epilog - %mul = fmul half %a, %b - %result = fadd half %mul, %c - ret half %result -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll index cc2a8ee11f180..dc4545bd82ae2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmamix-constant-bus-violation.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 { ; CHECK-LABEL: test_fmamix_constant_bus_violation_sss: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll index 1e7c7dcb620a5..1220c0e3b1ead 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s + +; TODO: Switch test to use -new-reg-bank-select after adding G_FNEG support. define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fmul_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll index 52425323332dd..2351bf2d6e876 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/madmix-constant-bus-violation.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 %s -o - | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 %s -o - | FileCheck %s define float @test_fmamix_constant_bus_violation_sss(i32 inreg %val.0, i32 inreg %val.1, i32 inreg %val.2) #0 { ; CHECK-LABEL: test_fmamix_constant_bus_violation_sss: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir index 9dfc7700e2c80..d63fc07ada772 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fma.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- name: fma_sss @@ -14,7 +15,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK-NEXT: [[FMA:%[0-9]+]]:sgpr(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY3]], [[COPY4]], [[COPY5]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = COPY $sgpr2 _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
