https://github.com/whytolearn created https://github.com/llvm/llvm-project/pull/156822
[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395 Covered functions: _mm_hadd_pi16 _mm_hadd_epi16 _mm256_hadd_epi16 _mm_hadd_pi32 _mm_hadd_epi32 _mm256_hadd_epi32 _mm_hadds_pi16 _mm_hadds_epi16 _mm256_hadds_epi16 _mm_hsub_pi16 _mm_hsub_epi16 _mm256_hsub_epi16 _mm_hsub_pi32 _mm_hsub_epi32 _mm256_hsub_epi32 _mm_hsubs_pi16 _mm_hsubs_epi16 _mm256_hsubs_epi16 _mm_hadd_pd _mm256_hadd_pd _mm_hadd_ps _mm256_hadd_ps _mm_hsub_pd _mm256_hsub_pd _mm_hsub_ps _mm256_hsub_ps >From a81c4068096b960de65c3517f18d2d31004afbce Mon Sep 17 00:00:00 2001 From: whyuuwang <[email protected]> Date: Thu, 4 Sep 2025 15:52:57 +0800 Subject: [PATCH 1/2] Address issue #155395: VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395 --- clang/lib/Headers/avx2intrin.h | 27 ++++----- clang/lib/Headers/avxintrin.h | 11 ++-- clang/lib/Headers/pmmintrin.h | 20 +++---- clang/lib/Headers/tmmintrin.h | 80 +++++++++++-------------- clang/test/CodeGen/X86/avx-builtins.c | 29 +++++++++ clang/test/CodeGen/X86/avx2-builtins.c | 63 +++++++++++++++++++ clang/test/CodeGen/X86/mmx-builtins.c | 48 +++++++++++++++ clang/test/CodeGen/X86/ssse3-builtins.c | 49 +++++++++++++++ 8 files changed, 250 insertions(+), 77 deletions(-) diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 384faa35d246f..f8fb808f7f29c 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -854,10 +854,9 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the sums. 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadd_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hadd_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit @@ -886,7 +885,7 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); @@ -921,10 +920,9 @@ _mm256_hadd_epi32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadds_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hadds_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit @@ -957,10 +955,9 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the differences. 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hsub_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hsub_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit @@ -989,7 +986,7 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); @@ -1025,7 +1022,7 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsubs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 26096da949447..976710a64e80e 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -703,7 +703,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b) /// elements of a vector of [4 x double]. /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of /// both operands. 
-static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); @@ -726,9 +726,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) /// index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of /// both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_hadd_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_hadd_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); } @@ -749,7 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) /// odd-indexed elements of a vector of [4 x double]. /// \returns A 256-bit vector of [4 x double] containing the horizontal /// differences of both operands. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); @@ -772,7 +771,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b) /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal /// differences of both operands. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h index cd605df7fb52d..400b28bb877a1 100644 --- a/clang/lib/Headers/pmmintrin.h +++ b/clang/lib/Headers/pmmintrin.h @@ -89,9 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b) /// destination. /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of /// both operands. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_hadd_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_ps(__m128 __a, __m128 __b) { return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); } @@ -174,9 +173,8 @@ _mm_moveldup_ps(__m128 __a) /// A 128-bit vector of [2 x double] containing the right source operand. /// \returns A 128-bit vector of [2 x double] containing the alternating sums /// and differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_addsub_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_addsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); } @@ -197,9 +195,8 @@ _mm_addsub_pd(__m128d __a, __m128d __b) /// destination. /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of /// both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_hadd_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_pd(__m128d __a, __m128d __b) { return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); } @@ -220,9 +217,8 @@ _mm_hadd_pd(__m128d __a, __m128d __b) /// the destination. /// \returns A 128-bit vector of [2 x double] containing the horizontal /// differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_hsub_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); } diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index f01c61afa8ea2..d79f7f6ea4091 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -204,10 +204,10 @@ _mm_abs_epi32(__m128i __a) { /// destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of /// both operands. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadd_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddw128( + (__v8hi)__a, (__v8hi)__b); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -227,10 +227,9 @@ _mm_hadd_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of /// both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadd_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_epi32(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -250,11 +249,10 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_hadd_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR + _mm_hadd_pi16(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_phaddw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -274,7 +272,7 @@ _mm_hadd_pi16(__m64 __a, __m64 __b) /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi32(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phaddd128( @@ -301,10 +299,9 @@ _mm_hadd_pi32(__m64 __a, __m64 __b) /// destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadds_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadds_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally adds, with saturation, the adjacent pairs of values contained @@ -327,7 +324,7 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadds_pi16(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phaddsw128( @@ -351,10 +348,9 @@ _mm_hadds_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsub_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -374,10 +370,9 @@ _mm_hsub_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences /// of both operands. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsub_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_epi32(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -397,7 +392,7 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi16(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phsubw128( @@ -421,7 +416,7 @@ _mm_hsub_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi32(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phsubd128( @@ -448,10 +443,9 @@ _mm_hsub_pi32(__m64 __a, __m64 __b) /// the destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated /// differences of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsubs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsubs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally subtracts, with saturation, the adjacent pairs of values @@ -474,7 +468,7 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phsubsw128( @@ -509,10 +503,9 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b) /// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n /// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n /// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15) -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maddubs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_maddubs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -539,11 +532,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_maddubs_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), - (__v16qi)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_maddubs_pi16(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit @@ -560,7 +552,7 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) /// A 128-bit vector of [8 x i16] containing one of the source operands. /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled /// products of both operands. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mulhrs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 4a048744faa61..f381faebededf 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1083,24 +1083,53 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) { // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_hadd_pd(A, B); } +constexpr bool test_mm256_hadd_epi32_constexpr() { + constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0); + constexpr __m256d result = _mm256_hadd_pd(a, b); + return match_m256d(result,1.0+2.0,3.0+4.0,5.0+6.0,7.0+8.0); +} +TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr()) __m256 test_mm256_hadd_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_hadd_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_hadd_ps(A, B); } +constexpr bool test_mm256_hadd_ps_constexpr() { + constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); + constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f); + constexpr __m256 result = _mm256_hadd_ps(a, b); + return match_m256(result,1.0f+2.0f,3.0f+4.0f,5.0f+6.0f,7.0f+8.0f, + 9.0f+10.0f,11.0f+12.0f,13.0f+14.0f,15.0f+16.0f); +} +TEST_CONSTEXPR(test_mm256_hadd_ps_constexpr()) __m256d test_mm256_hsub_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_hsub_pd // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_hsub_pd(A, B); } +constexpr bool test_mm256_hsub_pd_constexpr() { + constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + constexpr __m256d b 
= _mm256_set_pd(5.0, 6.0, 7.0, 8.0); + constexpr __m256d result = _mm256_hsub_pd(a, b); + return match_m256d(result,1.0-2.0,3.0-4.0,5.0-6.0,7.0-8.0); +} +TEST_CONSTEXPR(test_mm256_hsub_pd_constexpr()) __m256 test_mm256_hsub_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_hsub_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_hsub_ps(A, B); } +constexpr bool test_mm256_hsub_ps_constexpr() { + constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); + constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f); + constexpr __m256 result = _mm256_hsub_ps(a, b); + return match_m256(result,1.0f-2.0f,3.0f-4.0f,5.0f-6.0f,7.0f-8.0f, + 9.0f-10.0f,11.0f-12.0f,13.0f-14.0f,15.0f-16.0f); +} __m256i test_mm256_insert_epi8(__m256i x, char b) { // CHECK-LABEL: test_mm256_insert_epi8 diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index a39ce513837ea..02845b9417a1f 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -462,17 +462,48 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) { return _mm256_hadd_epi16(a, b); } +constexpr bool test_mm256_hadd_epi16_constexpr() { + constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, + 8,9,10,11,12,13,14,15,16); + constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23, + 24,25,26,27,28,29,30,31,32); + + constexpr __m256i result = _mm256_hadd_epi16(a, b); + return match_v16si(result,1+2,3+4,5+6,7+8,9+10,11+12,13+14,15+16,17+18,19+20,21+22,23+24,25+26,27+28,29+30,31+32); +} +TEST_CONSTEXPR(test_mm256_hadd_epi16_constexpr()) + __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadd_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_hadd_epi32(a, b); } +constexpr bool test_mm256_hadd_epi32_constexpr() { + constexpr __m256i a = 
_mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80); + constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75); + + constexpr __m256i result = _mm256_hadd_epi32(a, b); + return match_v8si(result,10+20,30+40,50+60,70+80,5+15,25+35, 45+55,65+75); +} +TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr()) + __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadds_epi16 // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hadds_epi16(a, b); } +constexpr bool test_mm256_hadds_epi16_constexpr() { + constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); + constexpr __m256i b = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1); + constexpr __m256i result = _mm256_hadds_epi16(a, b); + + return match_v16si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm256_hadds_epi16_constexpr()) __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsub_epi16 @@ -480,18 +511,50 @@ __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) { return _mm256_hsub_epi16(a, b); } +constexpr bool test_mm256_hsub_epi16_constexpr() { + constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, + 8,9,10,11,12,13,14,15,16); + constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23, + 24,25,26,27,28,29,30,31,32); + + constexpr __m256i result = _mm256_hsub_epi16(a, b); + return match_v16si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16,17-18,19-20,21-22,23-24,25-26,27-28,29-30,31-32); +} +TEST_CONSTEXPR(test_mm256_hsub_epi16_constexpr()) + __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsub_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_hsub_epi32(a, b); } +constexpr bool 
test_mm256_hsub_epi32_constexpr() { + constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80); + constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75); + + constexpr __m256i result = _mm256_hsub_epi32(a, b); + return match_v8si(result,10-20,30-40,50-60,70-80,5-15,25-35, 45-55,65-75); +} +TEST_CONSTEXPR(test_mm256_hsub_epi32_constexpr()) + __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsubs_epi16 // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hsubs_epi16(a, b); } +constexpr bool test_mm256_hsubs_epi16_constexpr() { + constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); + constexpr __m256i b = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + constexpr __m256i result3 = _mm256_hsubs_epi16(a, b); + + return match_v16si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm256_hsubs_epi16_constexpr()) + + __m128i test_mm_i32gather_epi32(int const *b, __m128i c) { // CHECK-LABEL: test_mm_i32gather_epi32 // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2) diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 7bd2475399bf9..8da0e8c814879 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -309,36 +309,84 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128( return _mm_hadd_pi16(a, b); } +constexpr bool test_mm_hadd_pi16_constexpr() { + constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4); + constexpr __m64 b = _mm_setr_pi16(5,6,7,8); + + constexpr __m64 result = _mm_hadd_pi16(a, b); + return 
match_v4si(result,1+2,3+4,5+6,7+8); +} +TEST_CONSTEXPR(test_mm_hadd_pi16_constexpr()) __m64 test_mm_hadd_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128( return _mm_hadd_pi32(a, b); } +constexpr bool test_mm_hadd_pi32_constexpr() { + constexpr __m64 a = _mm_setr_pi32(1, 2); + constexpr __m64 b = _mm_setr_pi32(3, 4); + + constexpr __m64 result = _mm_hadd_pi32(a, b); + return match_v2si(result,1+2,3+4); +} +TEST_CONSTEXPR(test_mm_hadd_pi32_constexpr()) __m64 test_mm_hadds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadds_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128( return _mm_hadds_pi16(a, b); } +constexpr bool test_mm_hadds_pi16_constexpr() { + constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767); + constexpr __m64 b = _mm_setr_pi16(1,1,1,1); + + constexpr __m64 result = _mm_hadds_pi16(a, b); + return match_v4si(result,32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm_hadds_pi16_constexpr()) __m64 test_mm_hsub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128( return _mm_hsub_pi16(a, b); } +constexpr bool test_mm_hsub_pi16_constexpr() { + constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4); + constexpr __m64 b = _mm_setr_pi16(5,6,7,8); + + constexpr __m64 result = _mm_hsub_pi16(a, b); + return match_v4si(result,1-2,3-4,5-6,7-8); +} +TEST_CONSTEXPR(test_mm_hsub_pi16_constexpr()) __m64 test_mm_hsub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128( return _mm_hsub_pi32(a, b); } +constexpr bool test_mm_hsub_pi32_constexpr() { + constexpr __m64 a = _mm_setr_pi32(1, 2); + constexpr __m64 b = _mm_setr_pi32(3, 4); + + constexpr __m64 result = _mm_hsub_pi32(a, b); + return match_v2si(result,1-2,3-4); +} +TEST_CONSTEXPR(test_mm_hsub_pi32_constexpr()) __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsubs_pi16 // CHECK: call <8 x i16> 
@llvm.x86.ssse3.phsub.sw.128( return _mm_hsubs_pi16(a, b); } +constexpr bool test_mm_hsubs_pi16_constexpr() { + constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767); + constexpr __m64 b = _mm_setr_pi16(-1,-1,-1,-1); + + constexpr __m64 result = _mm_hsubs_pi16(a, b); + return match_v4si(result,32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm_hsubs_pi16_constexpr()) __m64 test_mm_insert_pi16(__m64 a, int d) { // CHECK-LABEL: test_mm_insert_pi16 diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index 56ff73f08ab32..bd0ef43278217 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -60,36 +60,85 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hadd_epi16(a, b); } +constexpr bool test_mm_hadd_epi16_constexpr() { + constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + constexpr __m128i b = _mm_setr_epi16(17,18,19,20,21,22,23,24); + + constexpr __m128i result = _mm_hadd_epi16(a, b); + return match_v8si(result,1+2,3+4,5+6,7+8,17+18,19+20,21+22,23+24); +} +TEST_CONSTEXPR(test_mm_hadd_epi16_constexpr()) __m128i test_mm_hadd_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hadd_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_hadd_epi32(a, b); } +constexpr bool test_mm_hadd_epi32_constexpr() { + constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4); + constexpr __m128i b = _mm_setr_epi32(5,6,7,8); + + constexpr __m128i result = _mm_hadd_epi32(a, b); + return match_v4si(result,1+2,3+4,5+6,7+8); +} +TEST_CONSTEXPR(test_mm_hadd_epi32_constexpr()) __m128i test_mm_hadds_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hadds_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hadds_epi16(a, b); } +constexpr bool 
test_mm_hadds_epi16_constexpr() { + constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); + constexpr __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1); + constexpr __m128i result = _mm_hadds_epi16(a, b); + + return match_v8si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm_hadds_epi16_constexpr()) + __m128i test_mm_hsub_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsub_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hsub_epi16(a, b); } +constexpr bool test_mm_hsub_epi16_constexpr() { + constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + constexpr __m128i b = _mm_setr_epi16(9,10,11,12,13,14,15,16); + + constexpr __m128i result = _mm_hsub_epi16(a, b); + return match_v8si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16); +} +TEST_CONSTEXPR(test_mm_hsub_epi16_constexpr()) __m128i test_mm_hsub_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsub_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_hsub_epi32(a, b); } +constexpr bool test_mm_hsub_epi32_constexpr() { + constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4); + constexpr __m128i b = _mm_setr_epi32(5,6,7,8); + + constexpr __m128i result = _mm_hsub_epi32(a, b); + return match_v4si(result,1-2,3-4,5-6,7-8); +} +TEST_CONSTEXPR(test_mm_hsub_epi32_constexpr()) __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsubs_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hsubs_epi16(a, b); } +constexpr bool test_mm_hsubs_epi16_constexpr() { + constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); + constexpr __m128i b = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1); + constexpr __m128i result3 = _mm_hsubs_epi16(a, b); + + return match_v8si(result3, 32767, 32767, 32767, 32767, 32767, 
32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm_hsubs_epi16_constexpr()) __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_maddubs_epi16 >From 2fadf3fd261935e25adff5b26ad8ee0734746a26 Mon Sep 17 00:00:00 2001 From: whyuuwang <[email protected]> Date: Thu, 4 Sep 2025 15:55:44 +0800 Subject: [PATCH 2/2] Address issue #155395: [Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395 --- clang/lib/Headers/avx2intrin.h | 15 ++++----- clang/lib/Headers/avxintrin.h | 15 ++++----- clang/lib/Headers/pmmintrin.h | 4 +-- clang/lib/Headers/tmmintrin.h | 57 +++++++++++++++------------------- 4 files changed, 39 insertions(+), 52 deletions(-) diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index f8fb808f7f29c..c39f94c7fc16b 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -886,9 +886,8 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b) { /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the sums. 
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_hsub_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); +_mm256_hsub_epi32(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); } /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit @@ -1023,9 +1021,8 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_hsubs_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); +_mm256_hsubs_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); } /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 976710a64e80e..48d79063f9b61 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -704,8 +704,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b) /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of /// both operands. static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR -_mm256_hadd_pd(__m256d __a, __m256d __b) -{ +_mm256_hadd_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); } @@ -726,8 +725,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) /// index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of /// both operands. 
-static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm256_hadd_ps(__m256 __a, __m256 __b) { +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a, + __m256 __b) { return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); } @@ -749,8 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) { /// \returns A 256-bit vector of [4 x double] containing the horizontal /// differences of both operands. static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR -_mm256_hsub_pd(__m256d __a, __m256d __b) -{ +_mm256_hsub_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); } @@ -771,9 +769,8 @@ _mm256_hsub_pd(__m256d __a, __m256d __b) /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal /// differences of both operands. -static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm256_hsub_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, + __m256 __b) { return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); } diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h index 400b28bb877a1..67f2a7ffd1f56 100644 --- a/clang/lib/Headers/pmmintrin.h +++ b/clang/lib/Headers/pmmintrin.h @@ -89,8 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b) /// destination. /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of /// both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hadd_ps(__m128 __a, __m128 __b) { +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_ps(__m128 __a, + __m128 __b) { return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); } diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index d79f7f6ea4091..b408c6a3404ec 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -206,8 +206,7 @@ _mm_abs_epi32(__m128i __a) { /// both operands. 
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_epi16(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_phaddw128( - (__v8hi)__a, (__v8hi)__b); + return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -249,8 +248,8 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) { /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR - _mm_hadd_pi16(__m64 __a, __m64 __b) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi16(__m64 __a, + __m64 __b) { return __trunc64(__builtin_ia32_phaddw128( (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } @@ -272,11 +271,10 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hadd_pi32(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddd128( - (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi32(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phaddd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally adds, with saturation, the adjacent pairs of values contained @@ -324,11 +322,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) { /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hadds_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddsw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadds_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phaddsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -392,11 +389,10 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) { /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hsub_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -416,11 +412,10 @@ _mm_hsub_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hsub_pi32(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubd128( - (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi32(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally subtracts, with saturation, the adjacent pairs of values @@ -468,11 +463,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) { /// the destination. 
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hsubs_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubsw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -553,9 +547,8 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) { /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled /// products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_mulhrs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); +_mm_mulhrs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
