https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142156
>From 158179f7aba2fcdc96091da39f33ad99fd040af6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Fri, 30 May 2025 12:03:35 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  58 +++-
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll    | 294 +++++++-----------
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     | 179 +++++------
 4 files changed, 242 insertions(+), 291 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..af85c6bef273d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
   SDValue MagnitudeOp = N->getOperand(0);
   SDValue SignOp = N->getOperand(1);
+
+  // The generic combine for fcopysign + fp cast is too conservative with
+  // vectors, and also gets confused by the splitting we will perform here, so
+  // peek through FP casts.
+  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
+      SignOp.getOpcode() == ISD::FP_ROUND)
+    SignOp = SignOp.getOperand(0);
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT SignVT = SignOp.getValueType();
 
   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
-  if (MagnitudeOp.getValueType() == MVT::f64) {
-    SDValue MagAsVector =
-        DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
-    SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-                                MagAsVector, DAG.getConstant(0, DL, MVT::i32));
-    SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-                                MagAsVector, DAG.getConstant(1, DL, MVT::i32));
+  EVT MagVT = MagnitudeOp.getValueType();
+  if (MagVT.getScalarType() == MVT::f64) {
+    unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+    EVT F32VT = MagVT.isVector()
+                    ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
+                    : MVT::v2f32;
+
+    SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
+
+    SmallVector<SDValue, 8> NewElts;
+    for (unsigned I = 0; I != NumElts; ++I) {
+      SDValue MagLo =
+          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+                      DAG.getConstant(2 * I, DL, MVT::i32));
+      SDValue MagHi =
+          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+                      DAG.getConstant(2 * I + 1, DL, MVT::i32));
 
-    SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+      SDValue SignOpElt =
+          MagVT.isVector()
+              ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
+                            SignOp, DAG.getConstant(I, DL, MVT::i32))
+              : SignOp;
+
+      SDValue HiOp =
+          DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
+
+      SDValue Vector =
+          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+      SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+      NewElts.push_back(NewElt);
+    }
 
-    SDValue Vector =
-        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+    if (NewElts.size() == 1)
+      return NewElts[0];
 
-    return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignOp.getValueType() != MVT::f64)
+  if (SignVT != MVT::f64)
     return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
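For illustration, a minimal IR sketch of the input shape the extended combine now handles directly (the function name is hypothetical; the fpext-of-sign pattern mirrors the updated tests below):

declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>)

define <2 x double> @copysign_v2f64_sign_v2bf16(<2 x double> %mag, <2 x bfloat> %sign) {
  ; The combine peeks through this fpext, extracts each bf16 sign element,
  ; and applies an f32 copysign to only the high 32 bits of each f64 lane,
  ; rather than materializing f64 conversions of the sign vector.
  %sign.ext = fpext <2 x bfloat> %sign to <2 x double>
  %out = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign.ext)
  ret <2 x double> %out
}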
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..15b049d4d7563 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT: v_or_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT: v_cvt_f64_i32_e32 v[4:5], v4
 ; GFX9-NEXT: v_cvt_f64_i32_e32 v[6:7], v6
 ; GFX9-NEXT: s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 32e3f72af516f..3bd068362410b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4055,50 +4055,38 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> %ma
 ; GCN-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
-; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v7
+; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4
 ; GCN-NEXT: v_bfi_b32 v3, s4, v3, v5
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GFX7: ; %bb.0:
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX7-NEXT: s_brev_b32 s4, -2
-; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v5
-; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7
+; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4
+; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v5
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
 ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v4
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
 ; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
 ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4969,71 +4957,63 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x floa
 define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> inreg %mag, <2 x bfloat> inreg %sign) {
 ; GCN-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5
-; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_mov_b32_e32 v4, s3
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
-; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GCN-NEXT: v_bfi_b32 v0, s4, v4, v3
-; GCN-NEXT: v_bfi_b32 v1, s4, v5, v1
+; GCN-NEXT: s_brev_b32 s6, -2
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NEXT: v_bfi_b32 v0, s6, v0, v1
+; GCN-NEXT: v_bfi_b32 v1, s6, v2, v3
 ; GCN-NEXT: v_readfirstlane_b32 s1, v1
 ; GCN-NEXT: v_readfirstlane_b32 s3, v0
 ; GCN-NEXT: ; return to shader part epilog
 ;
 ; GFX7-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5
-; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX7-NEXT: s_brev_b32 s4, -2
+; GFX7-NEXT: s_brev_b32 s6, -2
 ; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_bfi_b32 v0, s6, v0, v1
 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: v_bfi_b32 v1, s6, v1, v2
 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
 ; GFX7-NEXT: v_readfirstlane_b32 s3, v0
 ; GFX7-NEXT: ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s5, s4, 16
 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX8-NEXT: s_brev_b32 s4, -2
+; GFX8-NEXT: s_brev_b32 s5, -2
 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; GFX8-NEXT: s_lshr_b32 s1, s4, 16
+; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0
+; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s1
 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_bfi_b32 v1, s4, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, s5, v2, v1
 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0
 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX9-NEXT: s_brev_b32 s4, -2
+; GFX9-NEXT: s_brev_b32 s5, -2
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; GFX9-NEXT: s_lshr_b32 s1, s4, 16
+; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
+; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1
 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v1
+; GFX9-NEXT: v_bfi_b32 v1, s5, v2, v1
 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0
 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s5, s4, 16
 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; GFX10-NEXT: s_lshr_b32 s4, s4, 16
+; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s4
 ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
 ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
 ; GFX10-NEXT: v_readfirstlane_b32 s1, v0
@@ -5042,14 +5022,15 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x doub
 ;
 ; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
 ; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s5, s4, 16
 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4
 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT: v_readfirstlane_b32 s3, v1
 ; GFX11-NEXT: ; return to shader part epilog
   %sign.ext = fpext <2 x bfloat> %sign to <2 x double>
@@ -5886,99 +5867,88 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3bf16(<3 x double> %ma
 ; GCN-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
-; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v11
-; GCN-NEXT: v_bfi_b32 v3, s4, v3, v9
-; GCN-NEXT: v_bfi_b32 v5, s4, v5, v7
+; GCN-NEXT: v_bfi_b32 v1, s4, v1, v6
+; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7
+; GCN-NEXT: v_bfi_b32 v5, s4, v5, v8
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
 ; GFX7: ; %bb.0:
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
 ; GFX7-NEXT: s_brev_b32 s4, -2
-; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v7
-; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v9
-; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v11
+; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v6
+; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7
+; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v8
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
 ; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v7
-; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v9
-; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v7
+; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v6
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v7
-; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v9
-; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v7
+; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v6
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
-; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v9
-; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
+; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v9
-; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
   %sign.ext = fpext <3 x bfloat> %sign to <3 x double>
   %out = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign.ext)
   ret <3 x double> %out
@@ -7060,76 +7030,52 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4bf16(<4 x double> %ma
 ; GCN-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v8
-; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v11
-; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
-; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v15
-; GCN-NEXT: v_bfi_b32 v3, s4, v3, v13
-; GCN-NEXT: v_bfi_b32 v5, s4, v5, v11
-; GCN-NEXT: v_bfi_b32 v7, s4, v7, v9
+; GCN-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GCN-NEXT: v_bfi_b32 v3, s4, v3, v9
+; GCN-NEXT: v_bfi_b32 v5, s4, v5, v10
+; GCN-NEXT: v_bfi_b32 v7, s4, v7, v11
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16:
 ; GFX7: ; %bb.0:
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v11
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v12
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v8
 ; GFX7-NEXT: s_brev_b32 s4, -2
-; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v11
-; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v13
-; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v15
-; GFX7-NEXT: v_bfi_b32 v7, s4, v7, v9
+; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v9
+; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v10
+; GFX7-NEXT: v_bfi_b32 v7, s4, v7, v11
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
 ; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
 ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v10
 ; GFX8-NEXT: v_bfi_b32 v7, s4, v7, v8
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
 ; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
 ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v10
 ; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v8
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index c8de7bc9d9de6..9d031a879d938 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -3365,11 +3365,7 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag
 ; SI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT: s_brev_b32 s4, -2
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
 ; SI-NEXT: v_bfi_b32 v3, s4, v3, v5
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3377,22 +3373,22 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag
 ; VI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
 ; VI-NEXT: s_brev_b32 s4, -2
-; VI-NEXT: v_bfi_b32 v1, s4, v1, v4
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_bfi_b32 v1, s4, v1, v5
 ; VI-NEXT: v_bfi_b32 v3, s4, v3, v4
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
 ; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
 ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4294,57 +4290,56 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float
 define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) {
 ; SI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
 ; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
-; SI-NEXT: v_cvt_f16_f32_e32 v1, s4
-; SI-NEXT: s_brev_b32 s4, -2
-; SI-NEXT: v_mov_b32_e32 v2, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_bfi_b32 v0, s4, v2, v0
-; SI-NEXT: v_mov_b32_e32 v2, s1
-; SI-NEXT: v_bfi_b32 v1, s4, v2, v1
+; SI-NEXT: s_brev_b32 s6, -2
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_bfi_b32 v0, s6, v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: v_bfi_b32 v1, s6, v1, v2
 ; SI-NEXT: v_readfirstlane_b32 s1, v1
 ; SI-NEXT: v_readfirstlane_b32 s3, v0
 ; SI-NEXT: ; return to shader part epilog
 ;
 ; VI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
 ; VI: ; %bb.0:
-; VI-NEXT: s_lshr_b32 s5, s4, 16
 ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; VI-NEXT: s_brev_b32 s4, -2
+; VI-NEXT: s_brev_b32 s5, -2
 ; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_bfi_b32 v0, s4, v1, v0
-; VI-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: v_bfi_b32 v0, s5, v1, v0
+; VI-NEXT: v_lshlrev_b32_e64 v1, 16, s1
 ; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s4, v2, v1
+; VI-NEXT: v_bfi_b32 v1, s5, v2, v1
 ; VI-NEXT: v_readfirstlane_b32 s1, v0
 ; VI-NEXT: v_readfirstlane_b32 s3, v1
 ; VI-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX9-NEXT: s_brev_b32 s4, -2
+; GFX9-NEXT: s_brev_b32 s5, -2
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; GFX9-NEXT: s_lshr_b32 s1, s4, 16
+; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
+; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1
 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v1
+; GFX9-NEXT: v_bfi_b32 v1, s5, v2, v1
 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0
 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
 ; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s5, s4, 16
 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4
 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
 ; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT: v_readfirstlane_b32 s3, v1
 ; GFX11-NEXT: ; return to shader part epilog
   %sign.ext = fpext <2 x half> %sign to <2 x double>
@@ -5206,13 +5201,7 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag
 ; SI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT: s_brev_b32 s4, -2
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v6
 ; SI-NEXT: v_bfi_b32 v3, s4, v3, v7
 ; SI-NEXT: v_bfi_b32 v5, s4, v5, v8
@@ -5221,67 +5210,57 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag
 ; VI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v8, v6
-; VI-NEXT: v_cvt_f32_f16_e32 v9, v7
-; VI-NEXT: v_cvt_f32_f16_sdwa v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
 ; VI-NEXT: s_brev_b32 s4, -2
-; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; VI-NEXT: v_bfi_b32 v1, s4, v1, v7
-; VI-NEXT: v_bfi_b32 v5, s4, v5, v9
-; VI-NEXT: v_bfi_b32 v3, s4, v3, v11
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_bfi_b32 v1, s4, v1, v8
+; VI-NEXT: v_bfi_b32 v5, s4, v5, v7
+; VI-NEXT: v_bfi_b32 v3, s4, v3, v6
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v6
-; GFX9-NEXT: v_cvt_f32_f16_e32 v9, v7
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v7
-; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v9
-; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v7
+; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v6
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v8, v6.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v9, v7.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v10, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v11
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6
 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v9, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v10, v8
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
-; GFX11-FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v11
+; GFX11-FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
   %sign.ext = fpext <3 x half> %sign to <3 x double>
   %out = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign.ext)
   ret <3 x double> %out
@@ -6483,14 +6462,6 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag
 ; SI-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT: s_brev_b32 s4, -2
 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v8
 ; SI-NEXT: v_bfi_b32 v3, s4, v3, v9
@@ -6501,32 +6472,32 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag
 ; VI-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
 ; VI-NEXT: s_brev_b32 s4, -2
-; VI-NEXT: v_bfi_b32 v1, s4, v1, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; VI-NEXT: v_bfi_b32 v5, s4, v5, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
 ; VI-NEXT: v_bfi_b32 v3, s4, v3, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; VI-NEXT: v_bfi_b32 v1, s4, v1, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_bfi_b32 v5, s4, v5, v10
 ; VI-NEXT: v_bfi_b32 v7, s4, v7, v8
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
 ; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
 ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v10
 ; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v8
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits