llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> --- Patch is 365.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142115.diff 2 Files Affected: - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+4746-3) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+3652) ``````````diff diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 4fcce8a6d623f..e99a6bf273e3b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -954,6 +954,4749 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1 ret <2 x i32> %ins.1 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11FAKE16: {{.*}} -; GFX11TRUE16: {{.*}} + +define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x bfloat> inreg %arg_sign) { +; GCN-LABEL: s_copysign_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: s_copysign_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s2 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_copysign_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s2, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_copysign_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s2, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_copysign_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_copysign_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog + %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %arg_mag, <2 x bfloat> %arg_sign) + %cast = bitcast <2 x bfloat> %out to i32 + ret i32 %cast +} + +define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x bfloat> inreg %arg_sign) { +; GCN-LABEL: s_copysign_v3bf16: +; GCN: ; %bb.0: +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 +; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v1, v4, v1 +; GCN-NEXT: v_or_b32_e32 v0, v3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v1 +; GCN-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: s_copysign_v3bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s0, v2 +; GFX7-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_copysign_v3bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshr_b32 s1, s2, 16 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_copysign_v3bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_copysign_v3bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_copysign_v3bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog + %out = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %arg_mag, <3 x bfloat> %arg_sign) + %cast = bitcast <3 x bfloat> %out to <3 x i16> + ret <3 x i16> %cast +} + +define amdgpu_ps <2 x i32> @s_copysign_v4bf16(<4 x bfloat> inreg %arg_mag, <4 x bfloat> inreg %arg_sign) { +; GCN-LABEL: s_copysign_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s7 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s6 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s2 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 +; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 +; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 +; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-NEXT: v_or_b32_e32 v2, v6, v2 +; GCN-NEXT: v_or_b32_e32 v1, v5, v1 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: s_copysign_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s7 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s6 +; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s2 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s0 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT: v_bfe_u32 v3, v5, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GFX7-NEXT: v_bfe_u32 v3, v4, 16, 15 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v2 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_copysign_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshr_b32 s3, s3, 16 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_lshr_b32 s1, s2, 16 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_copysign_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_copysign_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 +; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_copysign_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: ; return to shader part epilog + %out = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %arg_mag, <4 x bfloat> %arg_sign) + %cast = bitcast <4 x bfloat> %out to <2 x i32> + ret <2 x i32> %cast +} + +define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x bfloat> inreg %arg_sign) { +; GCN-LABEL: s_copysign_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s9 +; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s11 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s10 +; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s13 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s12 +; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s15 +; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s14 +; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s1 +; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3 +; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v12, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v13, 1.0, s4 +; GCN-NEXT: v_mul_f32_e64 v14, 1.0, s7 +; GCN-NEXT: v_mul_f32_e64 v15, 1.0, s6 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 +; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 +; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15 +; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15 +; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 +; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 +; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15 +; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15 +; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 +; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: v_or_b32_e32 v7, v15, v7 +; GCN-NEXT: v_or_b32_e32 v6, v14, v6 +; GCN-NEXT: v_or_b32_e32 v5, v13, v5 +; GCN-NEXT: v_or_b32_e32 v4, v12, v4 +; GCN-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-NEXT: v_or_b32_e32 v2, v10, v2 +; GCN-NEXT: v_or_b32_e32 v1, v9, v1 +; GCN-NEXT: v_or_b32_e32 v0, v8, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: v_or_b3... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/142115 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits