llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> --- Patch is 46.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142177.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+4-2) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+688) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+307) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3535eb41682d9..1957e442dbabb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, - MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, + MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16, + MVT::v32f16, MVT::v32bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 4bbd170529ad0..7c89a41d62fbf 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -2562,6 +2562,694 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ret <16 x bfloat> %result } +define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) { +; GCN-LABEL: v_copysign_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_bfe_u32 v32, v32, 16, 15 +; GCN-NEXT: v_and_b32_e32 v31, 0x8000, v31 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_or_b32_e32 v31, v32, v31 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_bfe_u32 v30, v30, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_or_b32_e32 v30, v30, v32 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_bfe_u32 v29, v29, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_or_b32_e32 v29, v29, v32 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_bfe_u32 v28, v28, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_or_b32_e32 v28, v28, v32 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_bfe_u32 v27, v27, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_or_b32_e32 v27, v27, v32 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_bfe_u32 v26, v26, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_or_b32_e32 v26, v26, v32 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_bfe_u32 v25, v25, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_bfe_u32 v24, v24, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_or_b32_e32 v24, v24, v32 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_bfe_u32 v23, v23, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_or_b32_e32 v23, v23, v32 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_bfe_u32 v22, v22, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_or_b32_e32 v22, v22, v32 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_bfe_u32 v21, v21, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_or_b32_e32 v21, v21, v32 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_bfe_u32 v20, v20, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_or_b32_e32 v20, v20, v32 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_or_b32_e32 v19, v19, v32 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_bfe_u32 v18, v18, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GCN-NEXT: v_or_b32_e32 v18, v18, v32 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_bfe_u32 v17, v17, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; GCN-NEXT: v_or_b32_e32 v17, v17, v32 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_bfe_u32 v16, v16, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_or_b32_e32 v16, v16, v32 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_or_b32_e32 v15, v15, v32 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_or_b32_e32 v14, v14, v32 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_or_b32_e32 v13, v13, v32 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_or_b32_e32 v12, v12, v32 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_or_b32_e32 v11, v11, v32 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_or_b32_e32 v10, v10, v32 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_or_b32_e32 v9, v9, v32 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_or_b32_e32 v8, v8, v32 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_or_b32_e32 v7, v7, v32 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_or_b32_e32 v6, v6, v32 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_or_b32_e32 v5, v5, v32 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_or_b32_e32 v4, v4, v32 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_or_b32_e32 v3, v3, v32 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_or_b32_e32 v2, v2, v32 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v32 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT: v_or_b32_e32 v0, v0, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_copysign_v32bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_bfe_u32 v30, v30, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_bfe_u32 v29, v29, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_bfe_u32 v28, v28, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_bfe_u32 v27, v27, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_bfe_u32 v26, v26, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_bfe_u32 v25, v25, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_bfe_u32 v24, v24, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_bfe_u32 v23, v23, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_bfe_u32 v22, v22, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_bfe_u32 v21, v21, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_bfe_u32 v20, v20, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_bfe_u32 v18, v18, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_bfe_u32 v17, v17, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_bfe_u32 v16, v16, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_bfe_u32 v12, v12, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_bfe_u32 v10, v10, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_bfe_u32 v8, v8, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_bfe_u32 v32, v32, 16, 15 +; GFX7-NEXT: v_and_b32_e32 v31, 0x8000, v31 +; GFX7-NEXT: v_or_b32_e32 v31, v32, v31 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GFX7-NEXT: v_or_b32_e32 v30, v30, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GFX7-NEXT: v_or_b32_e32 v29, v29, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GFX7-NEXT: v_or_b32_e32 v28, v28, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GFX7-NEXT: v_or_b32_e32 v27, v27, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 +; GFX7-NEXT: v_or_b32_e32 v26, v26, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: s_w... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/142177 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits