llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> Fixes #<!-- -->141931 --- Patch is 153.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142173.diff 6 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1) - (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+10) - (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+1-6) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+515-610) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+550-649) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c61c52ec5843e..ab3c316f76deb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -756,6 +756,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // allows matching fneg (fabs x) patterns) setOperationAction(ISD::FABS, MVT::v2f16, Legal); + // Can do this in one BFI plus a constant materialize. + setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom); + setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); @@ -6088,6 +6091,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SADDSAT: case ISD::SSUBSAT: return splitBinaryVectorOp(Op, DAG); + case ISD::FCOPYSIGN: + return lowerFCOPYSIGN(Op, DAG); case ISD::MUL: return lowerMUL(Op, DAG); case ISD::SMULO: @@ -7115,6 +7120,32 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, return DAG.getZExtOrTrunc(NewVal, DL, OpTy); } +SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { + SDValue Mag = Op.getOperand(0); + SDValue Sign = Op.getOperand(1); + + EVT MagVT = Mag.getValueType(); + EVT SignVT = Sign.getValueType(); + + assert(MagVT.isVector()); + + if (MagVT == SignVT) + return Op; + + assert(MagVT.getVectorNumElements() == 2); + + // fcopysign v2f16:mag, v2f32:sign -> + // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16) + + SDLoc SL(Op); + SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign); + SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32); + + SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16); + + return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16); +} + // Custom lowering for vector multiplications and s_mul_u64. SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index c42366a1c04c8..283f8136d352a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -149,6 +149,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const; + SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 2e2913d88cc54..28557ad516865 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2062,6 +2062,16 @@ def : GCNPat < >; } // End foreach fp16vt = [f16, bf16] + +foreach fp16vt = [v2f16, v2bf16] in { + +def : GCNPat < + (fcopysign fp16vt:$src0, fp16vt:$src1), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1) +>; + +} + /********** ================== **********/ /********** Immediate Patterns **********/ /********** ================== **********/ diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index 15b049d4d7563..021104114d796 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -36,17 +36,12 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1 -; GFX9-NEXT: v_bfi_b32 v2, s4, v1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %y = or <2 x i32> %y.arg, <i32 1, i32 1> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 26ea80a802f91..a5a36d7122f68 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -996,63 +996,34 @@ define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x bfloat ; ; GFX8-LABEL: s_copysign_v2bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s2, 0x7fff +; GFX8-NEXT: s_mov_b32 s2, 0x7fff7fff ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_v2bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: s_mov_b32 s2, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %arg_mag, <2 x bfloat> %arg_sign) @@ -2313,62 +2284,28 @@ define <2 x bfloat> @v_copysign_v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign) { ; GFX8-LABEL: v_copysign_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_bfi_b32 v2, s4, v0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfi_b32 v2, s4, v0, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v3, v2 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_copysign_v2bf16: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3 -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v0, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_copysign_v2bf16: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v3, v2 -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_copysign_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign) ret <2 x bfloat> %result } @@ -4171,49 +4108,42 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: @@ -4228,13 +4158,9 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: @@ -4247,19 +4173,15 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: @@ -4272,19 +4194,13 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 -; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %mag.trunc = fptrunc <2 x float> %mag to <2 x bfloat> %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag.trunc, <2 x bfloat> %sign) @@ -4333,89 +4249,83 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> %m ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f64_e32 v7, v[2:3] -; GFX8-NEXT: s_movk_i32 s8, 0x7fff -; GFX8-NEXT: v_cvt_f32_f64_e32 v8, v[0:1] +; GFX8-NEXT: v_cvt_f32_f64_e32 v7, v[0:1] +; GFX8-NEXT: v_cvt_f32_f64_e32 v8, v[2:3] ; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v7 ; GFX8-NEXT: v_and_b32_e32 v9, 1, v7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]| -; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6] +; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]| +; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6] ; GFX8-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[6:7] ; GFX8-NEXT: v_add_u32_e64 v5, s[6:7], v7, v5 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc ; GFX8-NEXT: v_bfe_u32 v5, v7, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s8, v5 -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v5, v7 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]| +; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6] ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]| -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6] -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v3, -1, 1, s[6:7] -; GFX8-NEXT: v_add_u32_e64 v3, s[6:7], v8, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v9, v7, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, -1, 1,... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/142173 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits