llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

Fixes #<!-- -->141931

---

Patch is 153.02 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/142173.diff


6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1) 
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+10) 
- (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+1-6) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+515-610) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+550-649) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c61c52ec5843e..ab3c316f76deb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -756,6 +756,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // allows matching fneg (fabs x) patterns)
     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
 
+    // Can do this in one BFI plus a constant materialize.
+    setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom);
+
     setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
     setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, 
Legal);
 
@@ -6088,6 +6091,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG &DAG) const {
   case ISD::SADDSAT:
   case ISD::SSUBSAT:
     return splitBinaryVectorOp(Op, DAG);
+  case ISD::FCOPYSIGN:
+    return lowerFCOPYSIGN(Op, DAG);
   case ISD::MUL:
     return lowerMUL(Op, DAG);
   case ISD::SMULO:
@@ -7115,6 +7120,32 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue 
Op,
   return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
 }
 
+SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Mag = Op.getOperand(0);
+  SDValue Sign = Op.getOperand(1);
+
+  EVT MagVT = Mag.getValueType();
+  EVT SignVT = Sign.getValueType();
+
+  assert(MagVT.isVector());
+
+  if (MagVT == SignVT)
+    return Op;
+
+  assert(MagVT.getVectorNumElements() == 2);
+
+  // fcopysign v2f16:mag, v2f32:sign ->
+  //   fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
+
+  SDLoc SL(Op);
+  SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
+  SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, 
SignAsInt32);
+
+  SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
+
+  return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
+}
+
 // Custom lowering for vector multiplications and s_mul_u64.
 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h 
b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..283f8136d352a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -149,6 +149,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
+  SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td 
b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2e2913d88cc54..28557ad516865 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2062,6 +2062,16 @@ def : GCNPat <
 >;
 } // End foreach fp16vt = [f16, bf16]
 
+
+foreach fp16vt = [v2f16, v2bf16] in {
+
+def : GCNPat <
+  (fcopysign fp16vt:$src0, fp16vt:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1)
+>;
+
+}
+
 /********** ================== **********/
 /********** Immediate Patterns **********/
 /********** ================== **********/
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 15b049d4d7563..021104114d796 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -36,17 +36,12 @@ define <2 x half> 
@test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v3, v1
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v1, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %y = or <2 x i32> %y.arg, <i32 1, i32 1>
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 26ea80a802f91..a5a36d7122f68 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -996,63 +996,34 @@ define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> 
inreg %arg_mag, <2 x bfloat
 ;
 ; GFX8-LABEL: s_copysign_v2bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX8-NEXT:    s_mov_b32 s2, 0x7fff7fff
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_bfi_b32 v1, s2, v1, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_v2bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX9-NEXT:    s_mov_b32 s2, 0x7fff7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %arg_mag, <2 x 
bfloat> %arg_sign)
@@ -2313,62 +2284,28 @@ define <2 x bfloat> @v_copysign_v2bf16(<2 x bfloat> 
%mag, <2 x bfloat> %sign) {
 ; GFX8-LABEL: v_copysign_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_bfi_b32 v2, s4, v0, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v0, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fff, v3, v2
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11TRUE16-LABEL: v_copysign_v2bf16:
-; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v2, v3
-; GFX11TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff, v0, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_copysign_v2bf16:
-; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff, v3, v2
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_copysign_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag, <2 x 
bfloat> %sign)
   ret <2 x bfloat> %result
 }
@@ -4171,49 +4108,42 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma
 ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_bfe_u32 v4, v0, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
 ; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v0, s4
 ; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16:
@@ -4228,13 +4158,9 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma
 ; GFX10-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fff, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11TRUE16-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16:
@@ -4247,19 +4173,15 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma
 ; GFX11TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX11TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_3)
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.h
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v2
-; GFX11TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff, v1, v3
+; GFX11TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v2
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16:
@@ -4272,19 +4194,13 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma
 ; GFX11FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
 ; GFX11FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX11FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_3)
 ; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
 ; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v2
-; GFX11FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff, v1, v3
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v2
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mag.trunc = fptrunc <2 x float> %mag to <2 x bfloat>
   %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag.trunc, <2 x 
bfloat> %sign)
@@ -4333,89 +4249,83 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> %m
 ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f64_e32 v7, v[2:3]
-; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX8-NEXT:    v_cvt_f32_f64_e32 v8, v[0:1]
+; GFX8-NEXT:    v_cvt_f32_f64_e32 v7, v[0:1]
+; GFX8-NEXT:    v_cvt_f32_f64_e32 v8, v[2:3]
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[5:6], v7
 ; GFX8-NEXT:    v_and_b32_e32 v9, 1, v7
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v9
-; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]|
-; GFX8-NEXT:    v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]|
+; GFX8-NEXT:    v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, -1, 1, s[6:7]
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[6:7], v7, v5
 ; GFX8-NEXT:    s_or_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v7, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s8, v5
-; GFX8-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v5, v7
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[5:6], v8
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT:    v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1]
+; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]|
+; GFX8-NEXT:    v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6]
 ; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v7
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v8
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v3
-; GFX8-NEXT:    v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]|
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v7, vcc
-; GFX8-NEXT:    v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, -1, 1, s[6:7]
-; GFX8-NEXT:    v_add_u32_e64 v3, s[6:7], v8, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v9, v7, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, -1, 1,...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/142173
_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to