https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142157
>From ff07bad7e0442c2b4deabadda4d5242e9b190451 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Fri, 30 May 2025 12:15:33 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine This avoids some ugly codegen on pre-16-bit instruction targets now from annoying f16 legalization effects. This also avoids regressions on newer targets in a future patch. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 ++++--- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 501 ++------------------- 3 files changed, 129 insertions(+), 581 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index af85c6bef273d..c61c52ec5843e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // lower half with a copy. // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) EVT MagVT = MagnitudeOp.getValueType(); - if (MagVT.getScalarType() == MVT::f64) { - unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + + if (MagVT.getScalarType() == MVT::f64) { EVT F32VT = MagVT.isVector() ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) : MVT::v2f32; @@ -11777,7 +11778,7 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignVT != MVT::f64) + if (SignVT.getScalarType() != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. @@ -11785,13 +11786,31 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // fcopysign f64:x, f64:y -> // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) // TODO: In some cases it might make sense to go all the way to f16. - SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); - SDValue SignAsF32 = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, - DAG.getConstant(1, DL, MVT::i32)); + + EVT F32VT = MagVT.isVector() + ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) + : MVT::v2f32; + + SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp); + + SmallVector<SDValue, 8> F32Signs; + for (unsigned I = 0; I != NumElts; ++I) { + // Take sign from odd elements of cast vector + SDValue SignAsF32 = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, + DAG.getConstant(2 * I + 1, DL, MVT::i32)); + F32Signs.push_back(SignAsF32); + } + + SDValue NewSign = + NumElts == 1 + ? F32Signs.back() + : DAG.getNode(ISD::BUILD_VECTOR, DL, + EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts), + F32Signs); return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), - SignAsF32); + NewSign); } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bd068362410b..26ea80a802f91 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[4:5] +; GCN-NEXT: v_and_b32_e32 v2, 0x80000000, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v5 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: @@ -5585,16 +5581,14 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> i ; GCN: ; %bb.0: ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 -; GCN-NEXT: v_cvt_f32_f64_e32 v2, s[4:5] -; GCN-NEXT: v_cvt_f32_f64_e32 v3, s[2:3] -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: s_and_b32 s0, s3, 0x80000000 +; GCN-NEXT: s_and_b32 s1, s5, 0x80000000 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: s_lshr_b32 s1, s1, 16 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_or_b32_e32 v1, s0, v1 +; GCN-NEXT: v_or_b32_e32 v0, s1, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 @@ -5602,18 +5596,16 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> i ; ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX7-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 +; GFX7-NEXT: s_and_b32 s0, s3, 0x80000000 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 +; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX7-NEXT: s_and_b32 s0, s5, 0x80000000 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -6682,51 +6674,45 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %m ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[3:4] -; GCN-NEXT: v_cvt_f32_f64_e32 v4, v[5:6] -; GCN-NEXT: v_cvt_f32_f64_e32 v5, v[7:8] +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v8 +; GCN-NEXT: v_and_b32_e32 v4, 0x80000000, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: v_or_b32_e32 v2, v2, v5 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_or_b32_e32 v0, v0, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[3:4] -; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[5:6] -; GFX7-NEXT: v_cvt_f32_f64_e32 v5, v[7:8] -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8082,66 +8068,58 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64(<4 x bfloat> %m ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_cvt_f32_f64_e32 v4, v[4:5] -; GCN-NEXT: v_cvt_f32_f64_e32 v5, v[6:7] -; GCN-NEXT: v_cvt_f32_f64_e32 v6, v[8:9] -; GCN-NEXT: v_cvt_f32_f64_e32 v7, v[10:11] +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_and_b32_e32 v4, 0x80000000, v7 +; GCN-NEXT: v_and_b32_e32 v6, 0x80000000, v11 +; GCN-NEXT: v_and_b32_e32 v7, 0x80000000, v9 +; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v7 -; GCN-NEXT: v_or_b32_e32 v2, v2, v6 -; GCN-NEXT: v_or_b32_e32 v1, v1, v5 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v4 +; GCN-NEXT: v_or_b32_e32 v3, v3, v6 +; GCN-NEXT: v_or_b32_e32 v2, v2, v7 +; GCN-NEXT: v_or_b32_e32 v0, v0, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[4:5] -; GFX7-NEXT: v_cvt_f32_f64_e32 v5, v[6:7] -; GFX7-NEXT: v_cvt_f32_f64_e32 v6, v[8:9] -; GFX7-NEXT: v_cvt_f32_f64_e32 v7, v[10:11] -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 9d031a879d938..1c0b5f97ed173 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -4013,96 +4013,13 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> %mag, <2 ; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v5 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v5 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v7, v5, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7 -; SI-NEXT: v_or_b32_e32 v6, 0x1000, v4 -; SI-NEXT: v_med3_i32 v8, v8, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6 -; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v8, v4, v8 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; SI-NEXT: v_and_b32_e32 v8, 7, v6 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 -; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; SI-NEXT: v_mov_b32_e32 v8, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7 -; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; SI-NEXT: v_mov_b32_e32 v9, 0x7e00 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 -; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v3 -; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffe, v5 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v6, v3, 20, 11 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6 -; SI-NEXT: v_or_b32_e32 v5, 0x1000, v2 -; SI-NEXT: v_med3_i32 v7, v7, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v10, v7, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6 -; SI-NEXT: v_or_b32_e32 v5, v10, v5 -; SI-NEXT: v_or_b32_e32 v7, v2, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6 -; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; SI-NEXT: v_and_b32_e32 v7, 7, v5 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64: @@ -4900,99 +4817,16 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg %mag, <2 x double> inreg %sign) { ; SI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64: ; SI: ; %bb.0: -; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s3, 8 -; SI-NEXT: s_and_b32 s6, s0, 0xffe -; SI-NEXT: s_and_b32 s0, s3, 0x1ff -; SI-NEXT: s_or_b32 s0, s0, s2 -; SI-NEXT: s_cmp_lg_u32 s0, 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-NEXT: v_readfirstlane_b32 s0, v2 -; SI-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; SI-NEXT: s_or_b32 s0, s6, s0 -; SI-NEXT: s_sub_i32 s6, 0x3f1, s2 -; SI-NEXT: v_med3_i32 v2, s6, 0, 13 -; SI-NEXT: s_or_b32 s1, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s6, v2 -; SI-NEXT: s_lshr_b32 s7, s1, s6 -; SI-NEXT: s_lshl_b32 s6, s7, s6 -; SI-NEXT: s_cmp_lg_u32 s6, s1 -; SI-NEXT: s_cselect_b32 s1, 1, 0 -; SI-NEXT: s_addk_i32 s2, 0xfc10 -; SI-NEXT: s_lshl_b32 s6, s2, 12 -; SI-NEXT: s_or_b32 s1, s7, s1 -; SI-NEXT: s_or_b32 s6, s0, s6 -; SI-NEXT: s_cmp_lt_i32 s2, 1 -; SI-NEXT: s_cselect_b32 s1, s1, s6 -; SI-NEXT: s_and_b32 s6, s1, 7 -; SI-NEXT: s_cmp_gt_i32 s6, 5 -; SI-NEXT: s_cselect_b32 s7, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_lshr_b32 s1, s1, 2 -; SI-NEXT: s_add_i32 s1, s1, s6 -; SI-NEXT: s_cmp_lt_i32 s2, 31 -; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_movk_i32 s6, 0x7e00 -; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s2, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s1 -; SI-NEXT: s_lshr_b32 s1, s3, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s2, s1, s0 -; SI-NEXT: s_lshr_b32 s0, s5, 8 -; SI-NEXT: s_and_b32 s3, s0, 0xffe -; SI-NEXT: s_and_b32 s0, s5, 0x1ff -; SI-NEXT: s_or_b32 s0, s0, s4 -; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-NEXT: v_readfirstlane_b32 s0, v2 -; SI-NEXT: s_or_b32 s0, s3, s0 -; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 -; SI-NEXT: s_sub_i32 s4, 0x3f1, s3 -; SI-NEXT: v_med3_i32 v2, s4, 0, 13 -; SI-NEXT: s_or_b32 s1, s0, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: s_lshr_b32 s7, s1, s4 -; SI-NEXT: s_lshl_b32 s4, s7, s4 -; SI-NEXT: s_cmp_lg_u32 s4, s1 -; SI-NEXT: s_cselect_b32 s1, 1, 0 -; SI-NEXT: s_addk_i32 s3, 0xfc10 -; SI-NEXT: s_lshl_b32 s4, s3, 12 -; SI-NEXT: s_or_b32 s1, s7, s1 -; SI-NEXT: s_or_b32 s4, s0, s4 -; SI-NEXT: s_cmp_lt_i32 s3, 1 -; SI-NEXT: s_cselect_b32 s1, s1, s4 -; SI-NEXT: s_and_b32 s4, s1, 7 -; SI-NEXT: s_cmp_gt_i32 s4, 5 -; SI-NEXT: s_cselect_b32 s7, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s4, 3 -; SI-NEXT: s_cselect_b32 s4, 1, 0 -; SI-NEXT: s_or_b32 s4, s4, s7 -; SI-NEXT: s_lshr_b32 s1, s1, 2 -; SI-NEXT: s_add_i32 s1, s1, s4 -; SI-NEXT: s_cmp_lt_i32 s3, 31 -; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f -; SI-NEXT: s_cselect_b32 s0, s0, s1 -; SI-NEXT: s_lshr_b32 s1, s5, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_mov_b32_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 +; SI-NEXT: v_mov_b32_e32 v2, s3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_bfi_b32 v1, s0, v1, v3 +; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -6079,138 +5913,16 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f64(<3 x half> %mag, <3 ; SI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0x1ff, v8 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_and_b32_e32 v9, 0xffe, v9 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v10, v8, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_sub_i32_e32 v11, vcc, s4, v10 -; SI-NEXT: v_or_b32_e32 v9, 0x1000, v7 -; SI-NEXT: v_med3_i32 v11, v11, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v12, v11, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, v11, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v9 -; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, s5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 12, v10 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v11, v7, v11 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10 -; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; SI-NEXT: v_and_b32_e32 v11, 7, v9 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshrrev_b32_e32 v9, 2, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; SI-NEXT: v_mov_b32_e32 v11, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v10 -; SI-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; SI-NEXT: v_mov_b32_e32 v12, 0x7e00 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10 -; SI-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0x1ff, v6 -; SI-NEXT: v_and_b32_e32 v8, 0x8000, v8 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v6 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_and_b32_e32 v8, 0xffe, v8 -; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v9, v6, 20, 11 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: v_sub_i32_e32 v10, vcc, s4, v9 -; SI-NEXT: v_or_b32_e32 v8, 0x1000, v5 -; SI-NEXT: v_med3_i32 v10, v10, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v13, v10, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, v10, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v8 -; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, s5, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 12, v9 -; SI-NEXT: v_or_b32_e32 v8, v13, v8 -; SI-NEXT: v_or_b32_e32 v10, v5, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v9 -; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc -; SI-NEXT: v_and_b32_e32 v10, 7, v8 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_lshrrev_b32_e32 v8, 2, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v9 -; SI-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v9 -; SI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v8, 0x1ff, v4 -; SI-NEXT: v_and_b32_e32 v6, 0x8000, v6 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v4 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6 -; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v8, v4, 20, 11 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_sub_i32_e32 v9, vcc, s4, v8 -; SI-NEXT: v_or_b32_e32 v6, 0x1000, v3 -; SI-NEXT: v_med3_i32 v9, v9, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v10, v9, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v10 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v6 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v8, vcc, s5, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v8 -; SI-NEXT: v_or_b32_e32 v6, v10, v6 -; SI-NEXT: v_or_b32_e32 v9, v3, v9 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v8 -; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; SI-NEXT: v_and_b32_e32 v9, 7, v6 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v6 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: @@ -7589,180 +7301,19 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f64(<4 x half> %mag, <4 ; SI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0x1ff, v11 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v11 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: v_and_b32_e32 v12, 0xffe, v12 -; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v13, v11, 20, 11 -; SI-NEXT: s_movk_i32 s4, 0x3f1 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_sub_i32_e32 v14, vcc, s4, v13 -; SI-NEXT: v_or_b32_e32 v12, 0x1000, v10 -; SI-NEXT: v_med3_i32 v14, v14, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v15, v14, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, v14, v15 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v14, v12 -; SI-NEXT: s_movk_i32 s5, 0xfc10 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v13, vcc, s5, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 12, v13 -; SI-NEXT: v_or_b32_e32 v12, v15, v12 -; SI-NEXT: v_or_b32_e32 v14, v10, v14 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v13 -; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc -; SI-NEXT: v_and_b32_e32 v14, 7, v12 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v14 -; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v14 -; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_lshrrev_b32_e32 v12, 2, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; SI-NEXT: v_mov_b32_e32 v14, 0x7c00 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v13 -; SI-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc -; SI-NEXT: v_mov_b32_e32 v15, 0x7e00 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_movk_i32 s6, 0x40f -; SI-NEXT: v_cndmask_b32_e32 v10, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v13 -; SI-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0x1ff, v9 -; SI-NEXT: v_and_b32_e32 v11, 0x8000, v11 -; SI-NEXT: v_or_b32_e32 v8, v12, v8 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_and_b32_e32 v11, 0xffe, v11 -; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v12, v9, 20, 11 -; SI-NEXT: v_or_b32_e32 v8, v11, v8 -; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v12 -; SI-NEXT: v_or_b32_e32 v11, 0x1000, v8 -; SI-NEXT: v_med3_i32 v13, v13, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v16, v13, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v16 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v11 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, s5, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v12 -; SI-NEXT: v_or_b32_e32 v11, v16, v11 -; SI-NEXT: v_or_b32_e32 v13, v8, v13 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v12 -; SI-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc -; SI-NEXT: v_and_b32_e32 v13, 7, v11 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13 -; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v13, v13, v16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 2, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v12 -; SI-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_cndmask_b32_e32 v8, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v12 -; SI-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v11, 0x1ff, v7 -; SI-NEXT: v_and_b32_e32 v9, 0x8000, v9 -; SI-NEXT: v_or_b32_e32 v6, v11, v6 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffe, v9 -; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v11, v7, 20, 11 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v11 -; SI-NEXT: v_or_b32_e32 v9, 0x1000, v6 -; SI-NEXT: v_med3_i32 v12, v12, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v13, v12, v9 -; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v13 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v9 -; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v11, vcc, s5, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v11 -; SI-NEXT: v_or_b32_e32 v9, v13, v9 -; SI-NEXT: v_or_b32_e32 v12, v6, v12 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v11 -; SI-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; SI-NEXT: v_and_b32_e32 v12, 7, v9 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12 -; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_lshrrev_b32_e32 v9, 2, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v11 -; SI-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_cndmask_b32_e32 v6, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v11 -; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v9, 0x1ff, v5 -; SI-NEXT: v_and_b32_e32 v7, 0x8000, v7 -; SI-NEXT: v_or_b32_e32 v4, v9, v4 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_and_b32_e32 v7, 0xffe, v7 -; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; SI-NEXT: v_bfe_u32 v9, v5, 20, 11 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_sub_i32_e32 v11, vcc, s4, v9 -; SI-NEXT: v_or_b32_e32 v7, 0x1000, v4 -; SI-NEXT: v_med3_i32 v11, v11, 0, 13 -; SI-NEXT: v_lshrrev_b32_e32 v12, v11, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, v11, v12 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v7 -; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, s5, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 12, v9 -; SI-NEXT: v_or_b32_e32 v7, v12, v7 -; SI-NEXT: v_or_b32_e32 v11, v4, v11 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v9 -; SI-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; SI-NEXT: v_and_b32_e32 v11, 7, v7 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 2, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_cndmask_b32_e32 v4, v14, v15, vcc -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: s_brev_b32 s4, -2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v5 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v6 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v10 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v9 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64: _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits