https://github.com/whytolearn updated https://github.com/llvm/llvm-project/pull/156822
>From a81c4068096b960de65c3517f18d2d31004afbce Mon Sep 17 00:00:00 2001 From: whyuuwang <whyuuw...@tencent.com> Date: Thu, 4 Sep 2025 15:52:57 +0800 Subject: [PATCH 1/4] deal this issues 155395 VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395 --- clang/lib/Headers/avx2intrin.h | 27 ++++----- clang/lib/Headers/avxintrin.h | 11 ++-- clang/lib/Headers/pmmintrin.h | 20 +++---- clang/lib/Headers/tmmintrin.h | 80 +++++++++++-------------- clang/test/CodeGen/X86/avx-builtins.c | 29 +++++++++ clang/test/CodeGen/X86/avx2-builtins.c | 63 +++++++++++++++++++ clang/test/CodeGen/X86/mmx-builtins.c | 48 +++++++++++++++ clang/test/CodeGen/X86/ssse3-builtins.c | 49 +++++++++++++++ 8 files changed, 250 insertions(+), 77 deletions(-) diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 384faa35d246f..f8fb808f7f29c 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -854,10 +854,9 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadd_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hadd_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit @@ -886,7 +885,7 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hadd_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); @@ -921,10 +920,9 @@ _mm256_hadd_epi32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadds_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hadds_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit @@ -957,10 +955,9 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hsub_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hsub_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit @@ -989,7 +986,7 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsub_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); @@ -1025,7 +1022,7 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_hsubs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 26096da949447..976710a64e80e 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -703,7 +703,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b) /// elements of a vector of [4 x double]. /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of /// both operands. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); @@ -726,9 +726,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) /// index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of /// both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_hadd_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_hadd_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); } @@ -749,7 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) /// odd-indexed elements of a vector of [4 x double]. /// \returns A 256-bit vector of [4 x double] containing the horizontal /// differences of both operands. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); @@ -772,7 +771,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b) /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal /// differences of both operands. -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h index cd605df7fb52d..400b28bb877a1 100644 --- a/clang/lib/Headers/pmmintrin.h +++ b/clang/lib/Headers/pmmintrin.h @@ -89,9 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b) /// destination. /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of /// both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_hadd_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_ps(__m128 __a, __m128 __b) { return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); } @@ -174,9 +173,8 @@ _mm_moveldup_ps(__m128 __a) /// A 128-bit vector of [2 x double] containing the right source operand. /// \returns A 128-bit vector of [2 x double] containing the alternating sums /// and differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_addsub_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_addsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); } @@ -197,9 +195,8 @@ _mm_addsub_pd(__m128d __a, __m128d __b) /// destination. /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of /// both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_hadd_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_pd(__m128d __a, __m128d __b) { return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); } @@ -220,9 +217,8 @@ _mm_hadd_pd(__m128d __a, __m128d __b) /// the destination. /// \returns A 128-bit vector of [2 x double] containing the horizontal /// differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_hsub_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); } diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index f01c61afa8ea2..d79f7f6ea4091 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -204,10 +204,10 @@ _mm_abs_epi32(__m128i __a) { /// destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of /// both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadd_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddw128( + (__v8hi)__a, (__v8hi)__b); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -227,10 +227,9 @@ _mm_hadd_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of /// both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadd_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_epi32(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -250,11 +249,10 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_hadd_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR + _mm_hadd_pi16(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_phaddw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -274,7 +272,7 @@ _mm_hadd_pi16(__m64 __a, __m64 __b) /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi32(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phaddd128( @@ -301,10 +299,9 @@ _mm_hadd_pi32(__m64 __a, __m64 __b) /// destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadds_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadds_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally adds, with saturation, the adjacent pairs of values contained @@ -327,7 +324,7 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadds_pi16(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phaddsw128( @@ -351,10 +348,9 @@ _mm_hadds_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsub_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -374,10 +370,9 @@ _mm_hsub_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsub_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_epi32(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -397,7 +392,7 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi16(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phsubw128( @@ -421,7 +416,7 @@ _mm_hsub_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi32(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phsubd128( @@ -448,10 +443,9 @@ _mm_hsub_pi32(__m64 __a, __m64 __b) /// the destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated /// differences of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsubs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsubs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally subtracts, with saturation, the adjacent pairs of values @@ -474,7 +468,7 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_phsubsw128( @@ -509,10 +503,9 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b) /// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n /// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n /// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15) -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maddubs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_maddubs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -539,11 +532,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_maddubs_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), - (__v16qi)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_maddubs_pi16(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit @@ -560,7 +552,7 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) /// A 128-bit vector of [8 x i16] containing one of the source operands. /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mulhrs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 4a048744faa61..f381faebededf 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1083,24 +1083,53 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) { // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_hadd_pd(A, B); } +constexpr bool test_mm256_hadd_epi32_constexpr() { + constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0); + constexpr __m256d result = _mm256_hadd_pd(a, b); + return match_m256d(result,1.0+2.0,3.0+4.0,5.0+6.0,7.0+8.0); +} +TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr()) __m256 test_mm256_hadd_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_hadd_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_hadd_ps(A, B); } +constexpr bool test_mm256_hadd_ps_constexpr() { + constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); + constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f); + constexpr __m256 result = _mm256_hadd_ps(a, b); + return match_m256(result,1.0f+2.0f,3.0f+4.0f,5.0f+6.0f,7.0f+8.0f, + 9.0f+10.0f,11.0f+12.0f,13.0f+14.0f,15.0f+16.0f); +} +TEST_CONSTEXPR(test_mm256_hadd_ps_constexpr()) __m256d test_mm256_hsub_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_hsub_pd // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_hsub_pd(A, B); } +constexpr bool test_mm256_hsub_pd_constexpr() { + constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); + constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0); + constexpr __m256d result = _mm256_hsub_pd(a, b); + return match_m256d(result,1.0-2.0,3.0-4.0,5.0-6.0,7.0-8.0); +} +TEST_CONSTEXPR(test_mm256_hsub_pd_constexpr()) __m256 test_mm256_hsub_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_hsub_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_hsub_ps(A, B); } +constexpr bool test_mm256_hsub_ps_constexpr() { + constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); + constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f); + constexpr __m256 result = _mm256_hsub_ps(a, b); + return match_m256(result,1.0f-2.0f,3.0f-4.0f,5.0f-6.0f,7.0f-8.0f, + 9.0f-10.0f,11.0f-12.0f,13.0f-14.0f,15.0f-16.0f); +} __m256i test_mm256_insert_epi8(__m256i x, char b) { // CHECK-LABEL: test_mm256_insert_epi8 diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index a39ce513837ea..02845b9417a1f 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -462,17 +462,48 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) { return _mm256_hadd_epi16(a, b); } +constexpr bool test_mm256_hadd_epi16_constexpr() { + constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, + 8,9,10,11,12,13,14,15,16); + constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23, + 24,25,26,27,28,29,30,31,32); + + constexpr __m256i result = _mm256_hadd_epi16(a, b); + return match_v16si(result,1+2,3+4,5+6,7+8,9+10,11+12,13+14,15+16,17+18,19+20,21+22,23+24,25+26,27+28,29+30,31+32); +} +TEST_CONSTEXPR(test_mm256_hadd_epi16_constexpr()) + __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadd_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_hadd_epi32(a, b); } +constexpr bool test_mm256_hadd_epi32_constexpr() { + constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80); + constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75); + + constexpr __m256i result = _mm256_hadd_epi32(a, b); + return match_v8si(result,10+20,30+40,50+60,70+80,5+15,25+35, 45+55,65+75); +} +TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr()) + __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadds_epi16 // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hadds_epi16(a, b); } +constexpr bool test_mm256_hadds_epi16_constexpr() { + constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); + constexpr __m256i b = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1); + constexpr __m256i result = _mm256_hadds_epi16(a, b); + + return match_v16si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm256_hadds_epi16_constexpr()) __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsub_epi16 @@ -480,18 +511,50 @@ __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) { return _mm256_hsub_epi16(a, b); } +constexpr bool test_mm256_hsub_epi16_constexpr() { + constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, + 8,9,10,11,12,13,14,15,16); + constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23, + 24,25,26,27,28,29,30,31,32); + + constexpr __m256i result = _mm256_hsub_epi16(a, b); + return match_v16si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16,17-18,19-20,21-22,23-24,25-26,27-28,29-30,31-32); +} +TEST_CONSTEXPR(test_mm256_hsub_epi16_constexpr()) + __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsub_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_hsub_epi32(a, b); } +constexpr bool test_mm256_hsub_epi32_constexpr() { + constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80); + constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75); + + constexpr __m256i result = _mm256_hsub_epi32(a, b); + return match_v8si(result,10-20,30-40,50-60,70-80,5-15,25-35, 45-55,65-75); +} +TEST_CONSTEXPR(test_mm256_hsub_epi32_constexpr()) + __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsubs_epi16 // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hsubs_epi16(a, b); } +constexpr bool test_mm256_hsubs_epi16_constexpr() { + constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); + constexpr __m256i b = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + constexpr __m256i result3 = _mm256_hsubs_epi16(a, b); + + return match_v16si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm256_hsubs_epi16_constexpr()) + + __m128i test_mm_i32gather_epi32(int const *b, __m128i c) { // CHECK-LABEL: test_mm_i32gather_epi32 // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2) diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 7bd2475399bf9..8da0e8c814879 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -309,36 +309,84 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128( return _mm_hadd_pi16(a, b); } +constexpr bool test_mm_hadd_pi16_constexpr() { + constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4); + constexpr __m64 b = _mm_setr_pi16(5,6,7,8); + + constexpr __m64 result = _mm_hadd_pi16(a, b); + return match_v4si(result,1+2,3+4,5+6,7+8); +} +TEST_CONSTEXPR(test_mm_hadd_pi16_constexpr()) __m64 test_mm_hadd_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128( return _mm_hadd_pi32(a, b); } +constexpr bool test_mm_hadd_pi32_constexpr() { + constexpr __m64 a = _mm_setr_pi32(1, 2); + constexpr __m64 b = _mm_setr_pi32(3, 4); + + constexpr __m64 result = _mm_hadd_pi32(a, b); + return match_v2si(result,1+2,3+4); +} +TEST_CONSTEXPR(test_mm_hadd_pi32_constexpr()) __m64 test_mm_hadds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadds_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128( return _mm_hadds_pi16(a, b); } +constexpr bool test_mm_hadds_pi16_constexpr() { + constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767); + constexpr __m64 b = _mm_setr_pi16(1,1,1,1); + + constexpr __m64 result = _mm_hadds_pi16(a, b); + return match_v4si(result,32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm_hadds_pi16_constexpr()) __m64 test_mm_hsub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128( return _mm_hsub_pi16(a, b); } +constexpr bool test_mm_hsub_pi16_constexpr() { + constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4); + constexpr __m64 b = _mm_setr_pi16(5,6,7,8); + + constexpr __m64 result = _mm_hsub_pi16(a, b); + return match_v4si(result,1-2,3-4,5-6,7-8); +} +TEST_CONSTEXPR(test_mm_hsub_pi16_constexpr()) __m64 test_mm_hsub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128( return _mm_hsub_pi32(a, b); } +constexpr bool test_mm_hsub_pi32_constexpr() { + constexpr __m64 a = _mm_setr_pi32(1, 2); + constexpr __m64 b = _mm_setr_pi32(3, 4); + + constexpr __m64 result = _mm_hsub_pi32(a, b); + return match_v2si(result,1-2,3-4); +} +TEST_CONSTEXPR(test_mm_hsub_pi32_constexpr()) __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsubs_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128( return _mm_hsubs_pi16(a, b); } +constexpr bool test_mm_hsubs_pi16_constexpr() { + constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767); + constexpr __m64 b = _mm_setr_pi16(-1,-1,-1,-1); + + constexpr __m64 result = _mm_hsubs_pi16(a, b); + return match_v4si(result,32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm_hsubs_pi16_constexpr()) __m64 test_mm_insert_pi16(__m64 a, int d) { // CHECK-LABEL: test_mm_insert_pi16 diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index 56ff73f08ab32..bd0ef43278217 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -60,36 +60,85 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hadd_epi16(a, b); } +constexpr bool test_mm_hadd_epi16_constexpr() { + constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + constexpr __m128i b = _mm_setr_epi16(17,18,19,20,21,22,23,24); + + constexpr __m128i result = _mm_hadd_epi16(a, b); + return match_v8si(result,1+2,3+4,5+6,7+8,17+18,19+20,21+22,23+24); +} +TEST_CONSTEXPR(test_mm_hadd_epi16_constexpr()) __m128i test_mm_hadd_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hadd_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_hadd_epi32(a, b); } +constexpr bool test_mm_hadd_epi32_constexpr() { + constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4); + constexpr __m128i b = _mm_setr_epi32(5,6,7,8); + + constexpr __m128i result = _mm_hadd_epi32(a, b); + return match_v4si(result,1+2,3+4,5+6,7+8); +} +TEST_CONSTEXPR(test_mm_hadd_epi32_constexpr()) __m128i test_mm_hadds_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hadds_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hadds_epi16(a, b); } +constexpr bool test_mm_hadds_epi16_constexpr() { + constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); + constexpr __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1); + constexpr __m128i result = _mm_hadds_epi16(a, b); + + return match_v8si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm_hadds_epi16_constexpr()) + __m128i test_mm_hsub_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsub_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hsub_epi16(a, b); } +constexpr bool test_mm_hsub_epi16_constexpr() { + constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + constexpr __m128i b = _mm_setr_epi16(9,10,11,12,13,14,15,16); + + constexpr __m128i result = _mm_hsub_epi16(a, b); + return match_v8si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16); +} +TEST_CONSTEXPR(test_mm_hsub_epi16_constexpr()) __m128i test_mm_hsub_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsub_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_hsub_epi32(a, b); } +constexpr bool test_mm_hsub_epi32_constexpr() { + constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4); + constexpr __m128i b = _mm_setr_epi32(5,6,7,8); + + constexpr __m128i result = _mm_hsub_epi32(a, b); + return match_v4si(result,1-2,3-4,5-6,7-8); +} +TEST_CONSTEXPR(test_mm_hsub_epi32_constexpr()) __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsubs_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hsubs_epi16(a, b); } +constexpr bool test_mm_hsubs_epi16_constexpr() { + constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); + constexpr __m128i b = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1); + constexpr __m128i result3 = _mm_hsubs_epi16(a, b); + + return match_v8si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); +} +TEST_CONSTEXPR(test_mm_hsubs_epi16_constexpr()) __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_maddubs_epi16 >From 2fadf3fd261935e25adff5b26ad8ee0734746a26 Mon Sep 17 00:00:00 2001 From: whyuuwang <whyuuw...@tencent.com> Date: Thu, 4 Sep 2025 15:55:44 +0800 Subject: [PATCH 2/4] deal issues 15595 [Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395 --- clang/lib/Headers/avx2intrin.h | 15 ++++----- clang/lib/Headers/avxintrin.h | 15 ++++----- clang/lib/Headers/pmmintrin.h | 4 +-- clang/lib/Headers/tmmintrin.h | 57 +++++++++++++++------------------- 4 files changed, 39 insertions(+), 52 deletions(-) diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index f8fb808f7f29c..c39f94c7fc16b 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -886,9 +886,8 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b) { /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_hadd_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); +_mm256_hadd_epi32(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); } /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit @@ -987,9 +986,8 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b) { /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_hsub_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); +_mm256_hsub_epi32(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); } /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit @@ -1023,9 +1021,8 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_hsubs_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); +_mm256_hsubs_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); } /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 976710a64e80e..48d79063f9b61 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -704,8 +704,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b) /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of /// both operands. static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR -_mm256_hadd_pd(__m256d __a, __m256d __b) -{ +_mm256_hadd_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); } @@ -726,8 +725,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) /// index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of /// both operands. -static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm256_hadd_ps(__m256 __a, __m256 __b) { +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a, + __m256 __b) { return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); } @@ -749,8 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) { /// \returns A 256-bit vector of [4 x double] containing the horizontal /// differences of both operands. static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR -_mm256_hsub_pd(__m256d __a, __m256d __b) -{ +_mm256_hsub_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); } @@ -771,9 +769,8 @@ _mm256_hsub_pd(__m256d __a, __m256d __b) /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal /// differences of both operands. -static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm256_hsub_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, + __m256 __b) { return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); } diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h index 400b28bb877a1..67f2a7ffd1f56 100644 --- a/clang/lib/Headers/pmmintrin.h +++ b/clang/lib/Headers/pmmintrin.h @@ -89,8 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b) /// destination. /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of /// both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hadd_ps(__m128 __a, __m128 __b) { +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_ps(__m128 __a, + __m128 __b) { return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); } diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index d79f7f6ea4091..b408c6a3404ec 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -206,8 +206,7 @@ _mm_abs_epi32(__m128i __a) { /// both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_epi16(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_phaddw128( - (__v8hi)__a, (__v8hi)__b); + return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -249,8 +248,8 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) { /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR - _mm_hadd_pi16(__m64 __a, __m64 __b) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi16(__m64 __a, + __m64 __b) { return __trunc64(__builtin_ia32_phaddw128( (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } @@ -272,11 +271,10 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hadd_pi32(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddd128( - (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi32(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phaddd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally adds, with saturation, the adjacent pairs of values contained @@ -324,11 +322,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) { /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hadds_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddsw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadds_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phaddsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -392,11 +389,10 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) { /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hsub_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -416,11 +412,10 @@ _mm_hsub_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hsub_pi32(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubd128( - (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi32(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally subtracts, with saturation, the adjacent pairs of values @@ -468,11 +463,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) { /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_hsubs_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubsw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -553,9 +547,8 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) { /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled /// products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_mulhrs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); +_mm_mulhrs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit >From ed4a09fb51ab347b4778b81d1f8c511d31d106a7 Mon Sep 17 00:00:00 2001 From: whyuuwang <whyuuw...@tencent.com> Date: Thu, 11 Sep 2025 13:48:13 +0800 Subject: [PATCH 3/4] constexpr deal --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 243 +++++++++++++++++---- clang/lib/AST/ExprConstant.cpp | 266 +++++++++++++++++------ 2 files changed, 407 insertions(+), 102 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8c2b71160f7f3..f6027c78935c3 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -8,6 +8,7 @@ #include "../ExprConstShared.h" #include "Boolean.h" #include "EvalEmitter.h" +#include "Floating.h" #include "Interp.h" #include "InterpBuiltinBitCast.h" #include "PrimType.h" @@ -19,6 +20,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SipHash.h" +#include <cassert> namespace clang { namespace interp { @@ -2736,6 +2738,141 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC, return true; } +static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const CallExpr *Call, + uint32_t BuiltinID) { + assert(Call->getArg(0)->getType()->isVectorType() && + Call->getArg(1)->getType()->isVectorType()); + const Pointer &RHS = S.Stk.pop<Pointer>(); + const Pointer &LHS = S.Stk.pop<Pointer>(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>(); + PrimType ElemT = *S.getContext().classify(VT->getElementType()); + unsigned SourceLen = VT->getNumElements(); + assert(SourceLen % 2 == 0 && + Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == SourceLen); + PrimType DstElemT = *S.getContext().classify( + Call->getType()->castAs<VectorType>()->getElementType()); + unsigned DstElem = 0; + + bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_phaddw128 || + BuiltinID == clang::X86::BI__builtin_ia32_phaddw256 || + BuiltinID == clang::X86::BI__builtin_ia32_phaddd128 || + BuiltinID == clang::X86::BI__builtin_ia32_phaddd256 || + BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 || + BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256); + + bool IsSaturating = (BuiltinID == clang::X86::BI__builtin_ia32_phaddsw128 || + BuiltinID == clang::X86::BI__builtin_ia32_phaddsw256 || + BuiltinID == clang::X86::BI__builtin_ia32_phsubsw128 || + BuiltinID == clang::X86::BI__builtin_ia32_phsubsw256); + + for (unsigned I = 0; I != SourceLen; I += 2) { + APSInt Elem1; + APSInt Elem2; + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + Elem1 = LHS.elem<T>(I).toAPSInt(); + Elem2 = LHS.elem<T>(I+1).toAPSInt(); + }); + APSInt Result; + if (IsAdd) { + if (IsSaturating) { + Result = APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); + }else{ + Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned()); + } + }else{ + if (IsSaturating) { + Result = + APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); + } else { + Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned()); + } + } + INT_TYPE_SWITCH_NO_BOOL(DstElemT, + { Dst.elem<T>(DstElem) = static_cast<T>(Result); }); + ++DstElem; + } + for (unsigned I = 0; I != SourceLen; I += 2) { + APSInt Elem1; + APSInt Elem2; + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + Elem1 = RHS.elem<T>(I).toAPSInt(); + Elem2 = RHS.elem<T>(I + 1).toAPSInt(); + }); + APSInt Result; + if (IsAdd) { + if (IsSaturating) { + Result = + APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); + } else { + Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned()); + } + } else { + if (IsSaturating) { + Result = APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); + } else { + Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned()); + } + } + INT_TYPE_SWITCH_NO_BOOL(DstElemT, + { Dst.elem<T>(DstElem) = static_cast<T>(Result); }); + ++DstElem; + } + Dst.initializeAllElements(); + return true; +} + +static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const CallExpr *Call, + uint32_t BuiltinID) { + assert(Call->getArg(0)->getType()->isVectorType() && + Call->getArg(1)->getType()->isVectorType()); + const Pointer &RHS = S.Stk.pop<Pointer>(); + const Pointer &LHS = S.Stk.pop<Pointer>(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts()); + llvm::RoundingMode RM = getRoundingMode(FPO); + const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>(); + unsigned SourceLen = VT->getNumElements(); + assert(SourceLen % 2 == 0 && + Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == + SourceLen); + unsigned DstElem = 0; + bool IsAdd = (BuiltinID == clang::X86::BI__builtin_ia32_haddpd || + BuiltinID == clang::X86::BI__builtin_ia32_haddpd256 || + BuiltinID == clang::X86::BI__builtin_ia32_haddps || + BuiltinID == clang::X86::BI__builtin_ia32_haddps256); + using T = Floating; + for (unsigned I = 0; I != SourceLen; I += 2) { + APFloat Elem1 = LHS.elem<T>(I).getAPFloat(); + APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat(); + + if (IsAdd) { + Elem1.add(Elem2, RM); + } else { + Elem1.subtract(Elem2, RM); + } + Dst.elem<T>(DstElem++) = Elem1; + } + for (unsigned I = 0; I != SourceLen; I += 2) { + APFloat Elem1 = RHS.elem<T>(I).getAPFloat(); + APFloat Elem2 = RHS.elem<T>(I + 1).getAPFloat(); + if (IsAdd) { + Elem1.add(Elem2, RM); + } else { + Elem1.subtract(Elem2, RM); + } + Dst.elem<T>(DstElem++) = Elem1; + } + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_elementwise_fma(InterpState &S, CodePtr OpPC, const CallExpr *Call) { assert(Call->getNumArgs() == 3); @@ -3356,49 +3493,73 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI__builtin_elementwise_min: return interp__builtin_elementwise_maxmin(S, OpPC, Call, BuiltinID); - case clang::X86::BI__builtin_ia32_pmuldq128: - case clang::X86::BI__builtin_ia32_pmuldq256: - case clang::X86::BI__builtin_ia32_pmuldq512: - case clang::X86::BI__builtin_ia32_pmuludq128: - case clang::X86::BI__builtin_ia32_pmuludq256: - case clang::X86::BI__builtin_ia32_pmuludq512: - return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID); - - case Builtin::BI__builtin_elementwise_fma: - return interp__builtin_elementwise_fma(S, OpPC, Call); - - case X86::BI__builtin_ia32_selectb_128: - case X86::BI__builtin_ia32_selectb_256: - case X86::BI__builtin_ia32_selectb_512: - case X86::BI__builtin_ia32_selectw_128: - case X86::BI__builtin_ia32_selectw_256: - case X86::BI__builtin_ia32_selectw_512: - case X86::BI__builtin_ia32_selectd_128: - case X86::BI__builtin_ia32_selectd_256: - case X86::BI__builtin_ia32_selectd_512: - case X86::BI__builtin_ia32_selectq_128: - case X86::BI__builtin_ia32_selectq_256: - case X86::BI__builtin_ia32_selectq_512: - case X86::BI__builtin_ia32_selectph_128: - case X86::BI__builtin_ia32_selectph_256: - case X86::BI__builtin_ia32_selectph_512: - case X86::BI__builtin_ia32_selectpbf_128: - case X86::BI__builtin_ia32_selectpbf_256: - case X86::BI__builtin_ia32_selectpbf_512: - case X86::BI__builtin_ia32_selectps_128: - case X86::BI__builtin_ia32_selectps_256: - case X86::BI__builtin_ia32_selectps_512: - case X86::BI__builtin_ia32_selectpd_128: - case X86::BI__builtin_ia32_selectpd_256: - case X86::BI__builtin_ia32_selectpd_512: - return interp__builtin_select(S, OpPC, Call); + case clang::X86::BI__builtin_ia32_phaddw128: + case clang::X86::BI__builtin_ia32_phaddw256: + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case clang::X86::BI__builtin_ia32_phsubd256: + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256: + + return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID); + case clang::X86::BI__builtin_ia32_haddpd: + case clang::X86::BI__builtin_ia32_haddpd256: + case clang::X86::BI__builtin_ia32_haddps: + case clang::X86::BI__builtin_ia32_haddps256: + case clang::X86::BI__builtin_ia32_hsubpd: + case clang::X86::BI__builtin_ia32_hsubpd256: + case clang::X86::BI__builtin_ia32_hsubps: + case clang::X86::BI__builtin_ia32_hsubps256: + return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID); - default: - S.FFDiag(S.Current->getLocation(OpPC), - diag::note_invalid_subexpr_in_const_expr) - << S.Current->getRange(OpPC); + case clang::X86::BI__builtin_ia32_pmuldq128: + case clang::X86::BI__builtin_ia32_pmuldq256: + case clang::X86::BI__builtin_ia32_pmuldq512: + case clang::X86::BI__builtin_ia32_pmuludq128: + case clang::X86::BI__builtin_ia32_pmuludq256: + case clang::X86::BI__builtin_ia32_pmuludq512: + return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID); + + case Builtin::BI__builtin_elementwise_fma: + return interp__builtin_elementwise_fma(S, OpPC, Call); + + case X86::BI__builtin_ia32_selectb_128: + case X86::BI__builtin_ia32_selectb_256: + case X86::BI__builtin_ia32_selectb_512: + case X86::BI__builtin_ia32_selectw_128: + case X86::BI__builtin_ia32_selectw_256: + case X86::BI__builtin_ia32_selectw_512: + case X86::BI__builtin_ia32_selectd_128: + case X86::BI__builtin_ia32_selectd_256: + case X86::BI__builtin_ia32_selectd_512: + case X86::BI__builtin_ia32_selectq_128: + case X86::BI__builtin_ia32_selectq_256: + case X86::BI__builtin_ia32_selectq_512: + case X86::BI__builtin_ia32_selectph_128: + case X86::BI__builtin_ia32_selectph_256: + case X86::BI__builtin_ia32_selectph_512: + case X86::BI__builtin_ia32_selectpbf_128: + case X86::BI__builtin_ia32_selectpbf_256: + case X86::BI__builtin_ia32_selectpbf_512: + case X86::BI__builtin_ia32_selectps_128: + case X86::BI__builtin_ia32_selectps_256: + case X86::BI__builtin_ia32_selectps_512: + case X86::BI__builtin_ia32_selectpd_128: + case X86::BI__builtin_ia32_selectpd_256: + case X86::BI__builtin_ia32_selectpd_512: + return interp__builtin_select(S, OpPC, Call); - return false; + default: + S.FFDiag(S.Current->getLocation(OpPC), + diag::note_invalid_subexpr_in_const_expr) + << S.Current->getRange(OpPC); + + return false; } llvm_unreachable("Unhandled builtin ID"); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 66362d44976c4..774a3adf1a7ca 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -55,6 +55,7 @@ #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "llvm/ADT/APFixedPoint.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/StringExtras.h" @@ -12105,6 +12106,145 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { } return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + + case clang::X86::BI__builtin_ia32_phaddw128: + case clang::X86::BI__builtin_ia32_phaddw256: + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: + + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case clang::X86::BI__builtin_ia32_phsubd256: + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256:{ + APValue SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType(); + bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType(); + + unsigned SourceLen = SourceLHS.getVectorLength(); + SmallVector<APValue, 4> ResultElements; + ResultElements.reserve(SourceLen); + for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) { + APSInt LHSA = SourceLHS.getVectorElt(EltNum).getInt(); + APSInt LHSB = SourceLHS.getVectorElt(EltNum + 1).getInt(); + + switch (E->getBuiltinCallee()) { + case clang::X86::BI__builtin_ia32_phaddw128: + case clang::X86::BI__builtin_ia32_phaddw256: + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: + ResultElements.push_back( + APValue(APSInt(LHSA+LHSB, DestUnsigned))); + break; + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: + ResultElements.push_back(APValue(APSInt( + LHSA.isSigned() ? LHSA.sadd_sat(LHSB) : LHSA.uadd_sat(LHSB), + DestUnsigned))); + break; + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case clang::X86::BI__builtin_ia32_phsubd256: + ResultElements.push_back(APValue(APSInt(LHSA - LHSB, DestUnsigned))); + break; + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256: + ResultElements.push_back(APValue(APSInt( + LHSA.isSigned() ? LHSA.ssub_sat(LHSB) : LHSA.usub_sat(LHSB), + DestUnsigned))); + break; + } + } + for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) { + APSInt RHSA = SourceRHS.getVectorElt(EltNum).getInt(); + APSInt RHSB = SourceRHS.getVectorElt(EltNum + 1).getInt(); + + switch (E->getBuiltinCallee()) { + case clang::X86::BI__builtin_ia32_phaddw128: + case clang::X86::BI__builtin_ia32_phaddw256: + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: + ResultElements.push_back(APValue(APSInt(RHSA + RHSB, DestUnsigned))); + break; + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: + ResultElements.push_back(APValue( + APSInt(RHSA.isSigned() ? RHSA.sadd_sat(RHSB) : RHSA.uadd_sat(RHSB), + DestUnsigned))); + break; + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case clang::X86::BI__builtin_ia32_phsubd256: + ResultElements.push_back(APValue(APSInt(RHSA - RHSB, DestUnsigned))); + break; + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256: + ResultElements.push_back(APValue( + APSInt(RHSA.isSigned() ? RHSA.ssub_sat(RHSB) : RHSA.usub_sat(RHSB), + DestUnsigned))); + break; + } + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_haddpd: + case clang::X86::BI__builtin_ia32_haddpd256: + case clang::X86::BI__builtin_ia32_haddps: + case clang::X86::BI__builtin_ia32_haddps256: { + APValue SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + unsigned SourceLen = SourceLHS.getVectorLength(); + SmallVector<APValue, 4> ResultElements; + ResultElements.reserve(SourceLen); + for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) { + APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat(); + APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat(); + LHSA.add(LHSB, APFloat::rmNearestTiesToEven); + ResultElements.push_back(APValue(LHSA)); + } + for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) { + APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat(); + APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat(); + RHSA.add(RHSB, APFloat::rmNearestTiesToEven); + ResultElements.push_back(APValue(RHSA)); + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_hsubpd: + case clang::X86::BI__builtin_ia32_hsubpd256: + case clang::X86::BI__builtin_ia32_hsubps: + case clang::X86::BI__builtin_ia32_hsubps256: { + APValue SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + unsigned SourceLen = SourceLHS.getVectorLength(); + SmallVector<APValue, 4> ResultElements; + ResultElements.reserve(SourceLen); + for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) { + APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat(); + APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat(); + LHSA.subtract(LHSB, APFloat::rmNearestTiesToEven); + ResultElements.push_back(APValue(LHSA)); + } + for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) { + APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat(); + APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat(); + RHSA.subtract(RHSB, APFloat::rmNearestTiesToEven); + ResultElements.push_back(APValue(RHSA)); + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } } } @@ -12197,67 +12337,71 @@ bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) { namespace { class ArrayExprEvaluator - : public ExprEvaluatorBase<ArrayExprEvaluator> { - const LValue &This; - APValue &Result; - public: - - ArrayExprEvaluator(EvalInfo &Info, const LValue &This, APValue &Result) - : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {} - - bool Success(const APValue &V, const Expr *E) { - assert(V.isArray() && "expected array"); - Result = V; - return true; - } - - bool ZeroInitialization(const Expr *E) { - const ConstantArrayType *CAT = - Info.Ctx.getAsConstantArrayType(E->getType()); - if (!CAT) { - if (E->getType()->isIncompleteArrayType()) { - // We can be asked to zero-initialize a flexible array member; this - // is represented as an ImplicitValueInitExpr of incomplete array - // type. In this case, the array has zero elements. - Result = APValue(APValue::UninitArray(), 0, 0); - return true; - } - // FIXME: We could handle VLAs here. - return Error(E); - } - - Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize()); - if (!Result.hasArrayFiller()) - return true; - - // Zero-initialize all elements. - LValue Subobject = This; - Subobject.addArray(Info, E, CAT); - ImplicitValueInitExpr VIE(CAT->getElementType()); - return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject, &VIE); - } - - bool VisitCallExpr(const CallExpr *E) { - return handleCallExpr(E, Result, &This); - } - bool VisitInitListExpr(const InitListExpr *E, - QualType AllocType = QualType()); - bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E); - bool VisitCXXConstructExpr(const CXXConstructExpr *E); - bool VisitCXXConstructExpr(const CXXConstructExpr *E, - const LValue &Subobject, - APValue *Value, QualType Type); - bool VisitStringLiteral(const StringLiteral *E, - QualType AllocType = QualType()) { - expandStringLiteral(Info, E, Result, AllocType); - return true; - } - bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E); - bool VisitCXXParenListOrInitListExpr(const Expr *ExprToVisit, - ArrayRef<Expr *> Args, - const Expr *ArrayFiller, - QualType AllocType = QualType()); - }; + : + public + ExprEvaluatorBase<ArrayExprEvaluator> { + const LValue &This; + APValue & Result; + + public: + ArrayExprEvaluator(EvalInfo & Info, const LValue &This, + APValue &Result) + : ExprEvaluatorBaseTy(Info), This(This), Result(Result) {} + + bool Success(const APValue &V, const Expr *E) { + assert(V.isArray() && "expected array"); + Result = V; + return true; + } + + bool ZeroInitialization(const Expr *E) { + const ConstantArrayType *CAT = + Info.Ctx.getAsConstantArrayType(E->getType()); + if (!CAT) { + if (E->getType()->isIncompleteArrayType()) { + // We can be asked to zero-initialize a flexible array member; + // this is represented as an ImplicitValueInitExpr of + // incomplete array type. In this case, the array has zero + // elements. + Result = APValue(APValue::UninitArray(), 0, 0); + return true; + } + // FIXME: We could handle VLAs here. + return Error(E); + } + + Result = APValue(APValue::UninitArray(), 0, CAT->getZExtSize()); + if (!Result.hasArrayFiller()) + return true; + + // Zero-initialize all elements. + LValue Subobject = This; + Subobject.addArray(Info, E, CAT); + ImplicitValueInitExpr VIE(CAT->getElementType()); + return EvaluateInPlace(Result.getArrayFiller(), Info, Subobject, + &VIE); + } + + bool VisitCallExpr(const CallExpr *E) { + return handleCallExpr(E, Result, &This); + } + bool VisitInitListExpr(const InitListExpr *E, + QualType AllocType = QualType()); + bool VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E); + bool VisitCXXConstructExpr(const CXXConstructExpr *E); + bool VisitCXXConstructExpr(const CXXConstructExpr *E, + const LValue &Subobject, APValue *Value, + QualType Type); + bool VisitStringLiteral(const StringLiteral *E, + QualType AllocType = QualType()) { + expandStringLiteral(Info, E, Result, AllocType); + return true; + } + bool VisitCXXParenListInitExpr(const CXXParenListInitExpr *E); + bool VisitCXXParenListOrInitListExpr( + const Expr *ExprToVisit, ArrayRef<Expr *> Args, + const Expr *ArrayFiller, QualType AllocType = QualType()); + }; } // end anonymous namespace static bool EvaluateArray(const Expr *E, const LValue &This, >From df6242e4b74e8170cd28a2f9663aa974a4b0b12b Mon Sep 17 00:00:00 2001 From: whyuuwang <whyuuw...@tencent.com> Date: Sat, 13 Sep 2025 20:58:15 +0800 Subject: [PATCH 4/4] adjust unit test #146940 --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 162 ++++++++++++----------- clang/test/CodeGen/X86/avx-builtins.c | 39 ++---- clang/test/CodeGen/X86/avx2-builtins.c | 87 ++++-------- clang/test/CodeGen/X86/mmx-builtins.c | 54 +------- clang/test/CodeGen/X86/ssse3-builtins.c | 54 +------- 5 files changed, 128 insertions(+), 268 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index f6027c78935c3..9d5d70698b8d3 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2739,9 +2739,9 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC, } static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const CallExpr *Call, - uint32_t BuiltinID) { + const InterpFrame *Frame, + const CallExpr *Call, + uint32_t BuiltinID) { assert(Call->getArg(0)->getType()->isVectorType() && Call->getArg(1)->getType()->isVectorType()); const Pointer &RHS = S.Stk.pop<Pointer>(); @@ -2752,7 +2752,8 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC, PrimType ElemT = *S.getContext().classify(VT->getElementType()); unsigned SourceLen = VT->getNumElements(); assert(SourceLen % 2 == 0 && - Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == SourceLen); + Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() == + SourceLen); PrimType DstElemT = *S.getContext().classify( Call->getType()->castAs<VectorType>()->getElementType()); unsigned DstElem = 0; @@ -2774,16 +2775,17 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC, APSInt Elem2; INT_TYPE_SWITCH_NO_BOOL(ElemT, { Elem1 = LHS.elem<T>(I).toAPSInt(); - Elem2 = LHS.elem<T>(I+1).toAPSInt(); + Elem2 = LHS.elem<T>(I + 1).toAPSInt(); }); APSInt Result; if (IsAdd) { - if (IsSaturating) { - Result = APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); - }else{ - Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned()); - } - }else{ + if (IsSaturating) { + Result = + APSInt(Elem1.sadd_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); + } else { + Result = APSInt(Elem1 + Elem2, /*IsUnsigned=*/Elem1.isUnsigned()); + } + } else { if (IsSaturating) { Result = APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); @@ -2812,7 +2814,8 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC, } } else { if (IsSaturating) { - Result = APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); + Result = + APSInt(Elem1.ssub_sat(Elem2), /*IsUnsigned=*/Elem1.isUnsigned()); } else { Result = APSInt(Elem1 - Elem2, /*IsUnsigned=*/Elem1.isUnsigned()); } @@ -2826,15 +2829,15 @@ static bool interp_builtin_ia32ph_add_sub(InterpState &S, CodePtr OpPC, } static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const CallExpr *Call, - uint32_t BuiltinID) { + const InterpFrame *Frame, + const CallExpr *Call, + uint32_t BuiltinID) { assert(Call->getArg(0)->getType()->isVectorType() && Call->getArg(1)->getType()->isVectorType()); const Pointer &RHS = S.Stk.pop<Pointer>(); const Pointer &LHS = S.Stk.pop<Pointer>(); const Pointer &Dst = S.Stk.peek<Pointer>(); - + FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts()); llvm::RoundingMode RM = getRoundingMode(FPO); const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>(); @@ -2851,7 +2854,6 @@ static bool interp_builtin_floatph_add_sub(InterpState &S, CodePtr OpPC, for (unsigned I = 0; I != SourceLen; I += 2) { APFloat Elem1 = LHS.elem<T>(I).getAPFloat(); APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat(); - if (IsAdd) { Elem1.add(Elem2, RM); } else { @@ -3495,71 +3497,71 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_phaddw128: case clang::X86::BI__builtin_ia32_phaddw256: - case clang::X86::BI__builtin_ia32_phaddd128: - case clang::X86::BI__builtin_ia32_phaddd256: - case clang::X86::BI__builtin_ia32_phaddsw128: - case clang::X86::BI__builtin_ia32_phaddsw256: - case clang::X86::BI__builtin_ia32_phsubw128: - case clang::X86::BI__builtin_ia32_phsubw256: - case clang::X86::BI__builtin_ia32_phsubd128: - case clang::X86::BI__builtin_ia32_phsubd256: - case clang::X86::BI__builtin_ia32_phsubsw128: - case clang::X86::BI__builtin_ia32_phsubsw256: - - return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID); - case clang::X86::BI__builtin_ia32_haddpd: - case clang::X86::BI__builtin_ia32_haddpd256: - case clang::X86::BI__builtin_ia32_haddps: - case clang::X86::BI__builtin_ia32_haddps256: - case clang::X86::BI__builtin_ia32_hsubpd: - case clang::X86::BI__builtin_ia32_hsubpd256: - case clang::X86::BI__builtin_ia32_hsubps: - case clang::X86::BI__builtin_ia32_hsubps256: - return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID); - - case clang::X86::BI__builtin_ia32_pmuldq128: - case clang::X86::BI__builtin_ia32_pmuldq256: - case clang::X86::BI__builtin_ia32_pmuldq512: - case clang::X86::BI__builtin_ia32_pmuludq128: - case clang::X86::BI__builtin_ia32_pmuludq256: - case clang::X86::BI__builtin_ia32_pmuludq512: - return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID); - - case Builtin::BI__builtin_elementwise_fma: - return interp__builtin_elementwise_fma(S, OpPC, Call); - - case X86::BI__builtin_ia32_selectb_128: - case X86::BI__builtin_ia32_selectb_256: - case X86::BI__builtin_ia32_selectb_512: - case X86::BI__builtin_ia32_selectw_128: - case X86::BI__builtin_ia32_selectw_256: - case X86::BI__builtin_ia32_selectw_512: - case X86::BI__builtin_ia32_selectd_128: - case X86::BI__builtin_ia32_selectd_256: - case X86::BI__builtin_ia32_selectd_512: - case X86::BI__builtin_ia32_selectq_128: - case X86::BI__builtin_ia32_selectq_256: - case X86::BI__builtin_ia32_selectq_512: - case X86::BI__builtin_ia32_selectph_128: - case X86::BI__builtin_ia32_selectph_256: - case X86::BI__builtin_ia32_selectph_512: - case X86::BI__builtin_ia32_selectpbf_128: - case X86::BI__builtin_ia32_selectpbf_256: - case X86::BI__builtin_ia32_selectpbf_512: - case X86::BI__builtin_ia32_selectps_128: - case X86::BI__builtin_ia32_selectps_256: - case X86::BI__builtin_ia32_selectps_512: - case X86::BI__builtin_ia32_selectpd_128: - case X86::BI__builtin_ia32_selectpd_256: - case X86::BI__builtin_ia32_selectpd_512: - return interp__builtin_select(S, OpPC, Call); + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case clang::X86::BI__builtin_ia32_phsubd256: + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256: + return interp_builtin_ia32ph_add_sub(S, OpPC, Frame, Call, BuiltinID); + + case clang::X86::BI__builtin_ia32_haddpd: + case clang::X86::BI__builtin_ia32_haddpd256: + case clang::X86::BI__builtin_ia32_haddps: + case clang::X86::BI__builtin_ia32_haddps256: + case clang::X86::BI__builtin_ia32_hsubpd: + case clang::X86::BI__builtin_ia32_hsubpd256: + case clang::X86::BI__builtin_ia32_hsubps: + case clang::X86::BI__builtin_ia32_hsubps256: + return interp_builtin_floatph_add_sub(S, OpPC, Frame, Call, BuiltinID); + + case clang::X86::BI__builtin_ia32_pmuldq128: + case clang::X86::BI__builtin_ia32_pmuldq256: + case clang::X86::BI__builtin_ia32_pmuldq512: + case clang::X86::BI__builtin_ia32_pmuludq128: + case clang::X86::BI__builtin_ia32_pmuludq256: + case clang::X86::BI__builtin_ia32_pmuludq512: + return interp__builtin_ia32_pmul(S, OpPC, Call, BuiltinID); + + case Builtin::BI__builtin_elementwise_fma: + return interp__builtin_elementwise_fma(S, OpPC, Call); + + case X86::BI__builtin_ia32_selectb_128: + case X86::BI__builtin_ia32_selectb_256: + case X86::BI__builtin_ia32_selectb_512: + case X86::BI__builtin_ia32_selectw_128: + case X86::BI__builtin_ia32_selectw_256: + case X86::BI__builtin_ia32_selectw_512: + case X86::BI__builtin_ia32_selectd_128: + case X86::BI__builtin_ia32_selectd_256: + case X86::BI__builtin_ia32_selectd_512: + case X86::BI__builtin_ia32_selectq_128: + case X86::BI__builtin_ia32_selectq_256: + case X86::BI__builtin_ia32_selectq_512: + case X86::BI__builtin_ia32_selectph_128: + case X86::BI__builtin_ia32_selectph_256: + case X86::BI__builtin_ia32_selectph_512: + case X86::BI__builtin_ia32_selectpbf_128: + case X86::BI__builtin_ia32_selectpbf_256: + case X86::BI__builtin_ia32_selectpbf_512: + case X86::BI__builtin_ia32_selectps_128: + case X86::BI__builtin_ia32_selectps_256: + case X86::BI__builtin_ia32_selectps_512: + case X86::BI__builtin_ia32_selectpd_128: + case X86::BI__builtin_ia32_selectpd_256: + case X86::BI__builtin_ia32_selectpd_512: + return interp__builtin_select(S, OpPC, Call); - default: - S.FFDiag(S.Current->getLocation(OpPC), - diag::note_invalid_subexpr_in_const_expr) - << S.Current->getRange(OpPC); + default: + S.FFDiag(S.Current->getLocation(OpPC), + diag::note_invalid_subexpr_in_const_expr) + << S.Current->getRange(OpPC); - return false; + return false; } llvm_unreachable("Unhandled builtin ID"); diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 9857b84c94112..4e21cfea41553 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1083,53 +1083,34 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) { // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_hadd_pd(A, B); } -constexpr bool test_mm256_hadd_epi32_constexpr() { - constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); - constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0); - constexpr __m256d result = _mm256_hadd_pd(a, b); - return match_m256d(result,1.0+2.0,3.0+4.0,5.0+6.0,7.0+8.0); -} -TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr()) +TEST_CONSTEXPR(match_m256d(_mm256_hadd_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), 3.0, 7.0, 11.0, 15.0)); __m256 test_mm256_hadd_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_hadd_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_hadd_ps(A, B); } -constexpr bool test_mm256_hadd_ps_constexpr() { - constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); - constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f); - constexpr __m256 result = _mm256_hadd_ps(a, b); - return match_m256(result,1.0f+2.0f,3.0f+4.0f,5.0f+6.0f,7.0f+8.0f, - 9.0f+10.0f,11.0f+12.0f,13.0f+14.0f,15.0f+16.0f); -} -TEST_CONSTEXPR(test_mm256_hadd_ps_constexpr()) +TEST_CONSTEXPR(_mm256_hadd_ps( + (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, + (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, + 3.0f, 7.0f, 11.0f, 15.0f, 19.0f, 23.0f, 27.0f, 31.0f)) __m256d test_mm256_hsub_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_hsub_pd // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_hsub_pd(A, B); } -constexpr bool test_mm256_hsub_pd_constexpr() { - constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); - constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0); - constexpr __m256d result = _mm256_hsub_pd(a, b); - return match_m256d(result,1.0-2.0,3.0-4.0,5.0-6.0,7.0-8.0); -} -TEST_CONSTEXPR(test_mm256_hsub_pd_constexpr()) +TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256d){5.0, 6.0, 7.0, 8.0}), -1.0,-1.0,-1.0,-1.0)); __m256 test_mm256_hsub_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_hsub_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_hsub_ps(A, B); } -constexpr bool test_mm256_hsub_ps_constexpr() { - constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); - constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f); - constexpr __m256 result = _mm256_hsub_ps(a, b); - return match_m256(result,1.0f-2.0f,3.0f-4.0f,5.0f-6.0f,7.0f-8.0f, - 9.0f-10.0f,11.0f-12.0f,13.0f-14.0f,15.0f-16.0f); -} +TEST_CONSTEXPR(_mm256_hsub_ps( + (__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, + (__m256){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}, + -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f)) __m256i test_mm256_insert_epi8(__m256i x, char b) { // CHECK-LABEL: test_mm256_insert_epi8 diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index c34594cf78a8e..a9095de4fe373 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -461,99 +461,60 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) { // CHECK: call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hadd_epi16(a, b); } - -constexpr bool test_mm256_hadd_epi16_constexpr() { - constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, - 8,9,10,11,12,13,14,15,16); - constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23, - 24,25,26,27,28,29,30,31,32); - - constexpr __m256i result = _mm256_hadd_epi16(a, b); - return match_v16si(result,1+2,3+4,5+6,7+8,9+10,11+12,13+14,15+16,17+18,19+20,21+22,23+24,25+26,27+28,29+30,31+32); -} -TEST_CONSTEXPR(test_mm256_hadd_epi16_constexpr()) +TEST_CONSTEXPR(match_v16hi(_mm256_hadd_epi16( + (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, + (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}), + 3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63)); __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadd_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_hadd_epi32(a, b); } - -constexpr bool test_mm256_hadd_epi32_constexpr() { - constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80); - constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75); - - constexpr __m256i result = _mm256_hadd_epi32(a, b); - return match_v8si(result,10+20,30+40,50+60,70+80,5+15,25+35, 45+55,65+75); -} -TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr()) +TEST_CONSTEXPR(match_v8si(_mm256_hadd_epi32( + (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}, + (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}) + 30,70,110,150,20,60,100,140)) __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadds_epi16 // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hadds_epi16(a, b); } -constexpr bool test_mm256_hadds_epi16_constexpr() { - constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, - 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); - constexpr __m256i b = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1); - constexpr __m256i result = _mm256_hadds_epi16(a, b); - - return match_v16si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, - 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); -} -TEST_CONSTEXPR(test_mm256_hadds_epi16_constexpr()) +TEST_CONSTEXPR(match_v16hi( _mm256_hadds_epi16( + (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767}, + (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767}, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767))) __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsub_epi16 // CHECK: call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hsub_epi16(a, b); } - -constexpr bool test_mm256_hsub_epi16_constexpr() { - constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, - 8,9,10,11,12,13,14,15,16); - constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23, - 24,25,26,27,28,29,30,31,32); - - constexpr __m256i result = _mm256_hsub_epi16(a, b); - return match_v16si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16,17-18,19-20,21-22,23-24,25-26,27-28,29-30,31-32); -} -TEST_CONSTEXPR(test_mm256_hsub_epi16_constexpr()) +TEST_CONSTEXPR(match_v16hi(_mm256_hsub_epi16( + (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, + (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}), + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1)); __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsub_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_hsub_epi32(a, b); } - -constexpr bool test_mm256_hsub_epi32_constexpr() { - constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80); - constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75); - - constexpr __m256i result = _mm256_hsub_epi32(a, b); - return match_v8si(result,10-20,30-40,50-60,70-80,5-15,25-35, 45-55,65-75); -} -TEST_CONSTEXPR(test_mm256_hsub_epi32_constexpr()) +TEST_CONSTEXPR(match_v8si(_mm256_hsub_epi32( + (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}, + (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}) + -10,-10,-10,-10,-10,-10,-10,-10)) __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsubs_epi16 // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hsubs_epi16(a, b); } - -constexpr bool test_mm256_hsubs_epi16_constexpr() { - constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, - 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); - constexpr __m256i b = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - constexpr __m256i result3 = _mm256_hsubs_epi16(a, b); - - return match_v16si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, - 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); -} -TEST_CONSTEXPR(test_mm256_hsubs_epi16_constexpr()) - +TEST_CONSTEXPR(match_v16hi( _mm256_hsubs_epi16( + (__m256i)(__v16hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767}, + (__m256i)(__v16hi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767))) __m128i test_mm_i32gather_epi32(int const *b, __m128i c) { // CHECK-LABEL: test_mm_i32gather_epi32 diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 70f521e380dd4..944af98ffcadc 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -309,84 +309,42 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128( return _mm_hadd_pi16(a, b); } -constexpr bool test_mm_hadd_pi16_constexpr() { - constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4); - constexpr __m64 b = _mm_setr_pi16(5,6,7,8); - - constexpr __m64 result = _mm_hadd_pi16(a, b); - return match_v4si(result,1+2,3+4,5+6,7+8); -} -TEST_CONSTEXPR(test_mm_hadd_pi16_constexpr()) +TEST_CONSTEXPR(match_v4hi(_mm_hadd_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),3,7,11,15)); __m64 test_mm_hadd_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128( return _mm_hadd_pi32(a, b); } -constexpr bool test_mm_hadd_pi32_constexpr() { - constexpr __m64 a = _mm_setr_pi32(1, 2); - constexpr __m64 b = _mm_setr_pi32(3, 4); - - constexpr __m64 result = _mm_hadd_pi32(a, b); - return match_v2si(result,1+2,3+4); -} -TEST_CONSTEXPR(test_mm_hadd_pi32_constexpr()) +TEST_CONSTEXPR(match_v2si(_mm_hadd_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),3,7)); __m64 test_mm_hadds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadds_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128( return _mm_hadds_pi16(a, b); } -constexpr bool test_mm_hadds_pi16_constexpr() { - constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767); - constexpr __m64 b = _mm_setr_pi16(1,1,1,1); - - constexpr __m64 result = _mm_hadds_pi16(a, b); - return match_v4si(result,32767, 32767, 32767, 32767); -} -TEST_CONSTEXPR(test_mm_hadds_pi16_constexpr()) +TEST_CONSTEXPR(match_v4hi(_mm_hadds_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){32767, 32767, 32767, 32767}),32767, 32767, 32767, 32767)); __m64 test_mm_hsub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128( return _mm_hsub_pi16(a, b); } -constexpr bool test_mm_hsub_pi16_constexpr() { - constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4); - constexpr __m64 b = _mm_setr_pi16(5,6,7,8); - - constexpr __m64 result = _mm_hsub_pi16(a, b); - return match_v4si(result,1-2,3-4,5-6,7-8); -} -TEST_CONSTEXPR(test_mm_hsub_pi16_constexpr()) +TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),-1,-1,-1,-1)); __m64 test_mm_hsub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128( return _mm_hsub_pi32(a, b); } -constexpr bool test_mm_hsub_pi32_constexpr() { - constexpr __m64 a = _mm_setr_pi32(1, 2); - constexpr __m64 b = _mm_setr_pi32(3, 4); - - constexpr __m64 result = _mm_hsub_pi32(a, b); - return match_v2si(result,1-2,3-4); -} -TEST_CONSTEXPR(test_mm_hsub_pi32_constexpr()) +TEST_CONSTEXPR(match_v2si(_mm_hsub_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),-1,-1)); __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsubs_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128( return _mm_hsubs_pi16(a, b); } -constexpr bool test_mm_hsubs_pi16_constexpr() { - constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767); - constexpr __m64 b = _mm_setr_pi16(-1,-1,-1,-1); - - constexpr __m64 result = _mm_hsubs_pi16(a, b); - return match_v4si(result,32767, 32767, 32767, 32767); -} -TEST_CONSTEXPR(test_mm_hsubs_pi16_constexpr()) +TEST_CONSTEXPR(match_v4hi(_mm_hsubs_pi16((__m64)(__v4hi){32767, 32767, 32767, 32767},(__m64)(__v4hi){-4,-5,-6,-7}),32767, 32767, 32767, 32767)); __m64 test_mm_insert_pi16(__m64 a, int d) { // CHECK-LABEL: test_mm_insert_pi16 diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index bd0ef43278217..61c7ee31af96f 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -60,42 +60,21 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hadd_epi16(a, b); } -constexpr bool test_mm_hadd_epi16_constexpr() { - constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); - constexpr __m128i b = _mm_setr_epi16(17,18,19,20,21,22,23,24); - - constexpr __m128i result = _mm_hadd_epi16(a, b); - return match_v8si(result,1+2,3+4,5+6,7+8,17+18,19+20,21+22,23+24); -} -TEST_CONSTEXPR(test_mm_hadd_epi16_constexpr()) +TEST_CONSTEXPR(match_v8hi(_mm_hadd_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){17,18,19,20,21,22,23,24}), 3,7,11,15,35,39,43,47)); __m128i test_mm_hadd_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hadd_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_hadd_epi32(a, b); } -constexpr bool test_mm_hadd_epi32_constexpr() { - constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4); - constexpr __m128i b = _mm_setr_epi32(5,6,7,8); - - constexpr __m128i result = _mm_hadd_epi32(a, b); - return match_v4si(result,1+2,3+4,5+6,7+8); -} -TEST_CONSTEXPR(test_mm_hadd_epi32_constexpr()) +TEST_CONSTEXPR(match_v4si(_mm_hadd_epi32((__m128i)(__v4si){1,2,3,4}, (__m128i)(__v4si){5,6,7,8}), 3,7,11,15)); __m128i test_mm_hadds_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hadds_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hadds_epi16(a, b); } -constexpr bool test_mm_hadds_epi16_constexpr() { - constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); - constexpr __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1); - constexpr __m128i result = _mm_hadds_epi16(a, b); - - return match_v8si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); -} -TEST_CONSTEXPR(test_mm_hadds_epi16_constexpr()) +TEST_CONSTEXPR(match_v8hi(_mm_hadds_epi16((__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}, (__m128i)(__v8hi){30000,30000,30000,30000,30000,30000,30000,30000}), 32767,32767,32767,32767,32767,32767,32767,32767)); __m128i test_mm_hsub_epi16(__m128i a, __m128i b) { @@ -103,42 +82,21 @@ __m128i test_mm_hsub_epi16(__m128i a, __m128i b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hsub_epi16(a, b); } -constexpr bool test_mm_hsub_epi16_constexpr() { - constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); - constexpr __m128i b = _mm_setr_epi16(9,10,11,12,13,14,15,16); - - constexpr __m128i result = _mm_hsub_epi16(a, b); - return match_v8si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16); -} -TEST_CONSTEXPR(test_mm_hsub_epi16_constexpr()) +TEST_CONSTEXPR(match_v8hi(_mm_hsub_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){9,10,11,12,13,14,15,16}), -1,-1,-1,-1,-1,-1,-1,-1)); __m128i test_mm_hsub_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsub_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_hsub_epi32(a, b); } -constexpr bool test_mm_hsub_epi32_constexpr() { - constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4); - constexpr __m128i b = _mm_setr_epi32(5,6,7,8); - - constexpr __m128i result = _mm_hsub_epi32(a, b); - return match_v4si(result,1-2,3-4,5-6,7-8); -} -TEST_CONSTEXPR(test_mm_hsub_epi32_constexpr()) +TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,2,1}, (__m128i)(__v4si){8,7,6,5}), 1,1,1,1)) __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsubs_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hsubs_epi16(a, b); } -constexpr bool test_mm_hsubs_epi16_constexpr() { - constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); - constexpr __m128i b = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1); - constexpr __m128i result3 = _mm_hsubs_epi16(a, b); - - return match_v8si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767); -} -TEST_CONSTEXPR(test_mm_hsubs_epi16_constexpr()) +TEST_CONSTEXPR(match_v8hi(_mm_hsubs_epi16((__m128i)(__v8hi){32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767},(__m128i)(__v8hi){-1,-1,-1,-1,-1,-1,-1,-1}), 32767,32767,32767,32767,32767,32767,32767,32767)); __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_maddubs_epi16 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits