llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Simon Pilgrim (RKSimon) <details> <summary>Changes</summary> Let standard casting / __builtin_convertvector handle the conversions from BF16 to F32. My only query is how best to implement _mm_cvtpbh_ps - in the end I went for the v8bf16 -> v8f32 conversion followed by subvector extraction, but could just as easily have extracted a v4bf16 first - it makes no difference to the final codegen. First part of #<!-- -->154911 --- Full diff: https://github.com/llvm/llvm-project/pull/169781.diff 6 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86.td (-4) - (modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (-2) - (modified) clang/lib/Headers/avx512bf16intrin.h (+7-8) - (modified) clang/lib/Headers/avx512vlbf16intrin.h (+12-14) - (modified) clang/test/CodeGen/X86/avx512bf16-builtins.c (+5-8) - (modified) clang/test/CodeGen/X86/avx512vlbf16-builtins.c (+13-16) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index fcc3957f9b8ab..f0991a1d82dfc 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -3358,10 +3358,6 @@ let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<5 def dpbf16ps_512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, __bf16>, _Vector<32, __bf16>)">; } -let Features = "avx512bf16", Attributes = [NoThrow, Const] in { - def cvtsbf162ss_32 : X86Builtin<"float(__bf16)">; -} - let Features = "avx512vp2intersect", Attributes = [NoThrow, RequiredVectorWidth<512>] in { def vp2intersect_q_512 : X86Builtin<"void(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char *, unsigned char *)">; } diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index 00c8a1cf16e31..1b4ab08588602 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -2796,8 +2796,6 
@@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128; return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } - case X86::BI__builtin_ia32_cvtsbf162ss_32: - return Builder.CreateFPExt(Ops[0], Builder.getFloatTy()); case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: { diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h index 3973f0e389685..9a21d8680045a 100644 --- a/clang/lib/Headers/avx512bf16intrin.h +++ b/clang/lib/Headers/avx512bf16intrin.h @@ -36,7 +36,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead"))); /// \returns A float data whose sign field and exponent field keep unchanged, /// and fraction field is extended to 23 bits. static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) { - return __builtin_ia32_cvtsbf162ss_32(__A); + return (float)(__A); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -236,8 +236,7 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { /// A 256-bit vector of [16 x bfloat]. /// \returns A 512-bit vector of [16 x float] come from conversion of __A static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( - (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); + return (__m512) __builtin_convertvector(__A, __v16sf); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. 
@@ -252,8 +251,9 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { /// \returns A 512-bit vector of [16 x float] come from conversion of __A static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( - (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtpbh_ps(__A), + (__v16sf)_mm512_setzero_ps()); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -270,9 +270,8 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { /// \returns A 512-bit vector of [16 x float] come from conversion of __A static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( - (__m512i)__S, (__mmask16)__U, - (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); + return (__m512)__builtin_ia32_selectps_512( + (__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S); } #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h index 2d7ea0114d6a5..0e3184554a6c8 100644 --- a/clang/lib/Headers/avx512vlbf16intrin.h +++ b/clang/lib/Headers/avx512vlbf16intrin.h @@ -422,8 +422,8 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { /// A 128-bit vector of [4 x bfloat]. /// \returns A 128-bit vector of [4 x float] come from conversion of __A static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { - return _mm_castsi128_ps( - (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16)); + return (__m128)_mm256_castps256_ps128( + (__m256) __builtin_convertvector(__A, __v8sf)); } /// Convert Packed BF16 Data to Packed float Data. 
@@ -434,8 +434,7 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { /// A 128-bit vector of [8 x bfloat]. /// \returns A 256-bit vector of [8 x float] come from conversion of __A static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( - (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); + return (__m256) __builtin_convertvector(__A, __v8sf); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -450,8 +449,8 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { /// \returns A 128-bit vector of [4 x float] come from conversion of __A static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return _mm_castsi128_ps((__m128i)_mm_slli_epi32( - (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps()); } /// Convert Packed BF16 Data to Packed float Data using zeroing mask. @@ -466,8 +465,9 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { /// \returns A 256-bit vector of [8 x float] come from conversion of __A static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( - (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtpbh_ps(__A), + (__v8sf)_mm256_setzero_ps()); } /// Convert Packed BF16 Data to Packed float Data using merging mask. 
@@ -485,9 +485,8 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { /// \returns A 128-bit vector of [4 x float] come from conversion of __A static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { - return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32( - (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A), - 16)); + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S); } /// Convert Packed BF16 Data to Packed float Data using merging mask. @@ -505,9 +504,8 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { /// \returns A 256-bit vector of [8 x float] come from conversion of __A static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( - (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), - 16)); + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S); } #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c index 3f544d387f7aa..dfaae4e459f7e 100644 --- a/clang/test/CodeGen/X86/avx512bf16-builtins.c +++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c @@ -79,23 +79,20 @@ __m512 test_mm512_mask_dpbf16_ps(__m512 D, __m512bh A, __m512bh B, __mmask16 U) __m512 test_mm512_cvtpbh_ps(__m256bh A) { // CHECK-LABEL: test_mm512_cvtpbh_ps - // CHECK: sext <16 x i16> %{{.*}} to <16 x i32> - // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}}) + // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float> return _mm512_cvtpbh_ps(A); } __m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) { // CHECK-LABEL: test_mm512_maskz_cvtpbh_ps - // CHECK: sext <16 x i16> %{{.*}} to <16 x i32> - // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> 
%{{.*}} - // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}}) + // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_cvtpbh_ps(M, A); } __m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) { // CHECK-LABEL: test_mm512_mask_cvtpbh_ps - // CHECK: sext <16 x i16> %{{.*}} to <16 x i32> - // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}}) - // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} + // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_cvtpbh_ps(S, M, A); } diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c index d59b254520774..80afcd7a490db 100644 --- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c @@ -156,46 +156,43 @@ __bf16 test_mm_cvtness_sbh(float A) { __m128 test_mm_cvtpbh_ps(__m128bh A) { // CHECK-LABEL: test_mm_cvtpbh_ps - // CHECK: sext <4 x i16> %{{.*}} to <4 x i32> - // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}}) + // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float> + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3> return _mm_cvtpbh_ps(A); } __m256 test_mm256_cvtpbh_ps(__m128bh A) { // CHECK-LABEL: test_mm256_cvtpbh_ps - // CHECK: sext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}}) + // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float> return _mm256_cvtpbh_ps(A); } __m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) { // CHECK-LABEL: test_mm_maskz_cvtpbh_ps - // CHECK: sext <4 x i16> %{{.*}} to <4 x i32> - // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> 
%{{.*}} - // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}}) + // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float> + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_cvtpbh_ps(M, A); } __m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) { // CHECK-LABEL: test_mm256_maskz_cvtpbh_ps - // CHECK: sext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} - // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}}) + // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_cvtpbh_ps(M, A); } __m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) { // CHECK-LABEL: test_mm_mask_cvtpbh_ps - // CHECK: sext <4 x i16> %{{.*}} to <4 x i32> - // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}}) - // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} + // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float> + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_cvtpbh_ps(S, M, A); } __m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) { // CHECK-LABEL: test_mm256_mask_cvtpbh_ps - // CHECK: sext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}}) - // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} + // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_cvtpbh_ps(S, M, A); } `````````` </details> https://github.com/llvm/llvm-project/pull/169781 
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
