D intrinsics to be used in constexpr #155395 (PR #156822)

via cfe-commits Thu, 04 Sep 2025 10:45:25 -0700

https://github.com/whytolearn created 
https://github.com/llvm/llvm-project/pull/156822


[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX 
PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395

cover func: 
_mm_hadd_pi16 _mm_hadd_epi16 _mm256_hadd_epi16 
_mm_hadd_pi32 _mm_hadd_epi32 _mm256_hadd_epi32
_mm_hadds_pi16 _mm_hadds_epi16 _mm256_hadds_epi16 

_mm_hsub_pi16 _mm_hsub_epi16 _mm256_hsub_epi16 
_mm_hsub_pi32 _mm_hsub_epi32 _mm256_hsub_epi32
_mm_hsubs_pi16 _mm_hsubs_epi16 _mm256_hsubs_epi16 

_mm_hadd_pd _mm256_hadd_pd
_mm_hadd_ps _mm256_hadd_ps
_mm_hsub_pd _mm256_hsub_pd
_mm_hsub_ps _mm256_hsub_ps  


>From a81c4068096b960de65c3517f18d2d31004afbce Mon Sep 17 00:00:00 2001
From: whyuuwang <[email protected]>
Date: Thu, 4 Sep 2025 15:52:57 +0800
Subject: [PATCH 1/2] deal this issues 155395
 VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX
 PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395

---
 clang/lib/Headers/avx2intrin.h          | 27 ++++-----
 clang/lib/Headers/avxintrin.h           | 11 ++--
 clang/lib/Headers/pmmintrin.h           | 20 +++----
 clang/lib/Headers/tmmintrin.h           | 80 +++++++++++--------------
 clang/test/CodeGen/X86/avx-builtins.c   | 29 +++++++++
 clang/test/CodeGen/X86/avx2-builtins.c  | 63 +++++++++++++++++++
 clang/test/CodeGen/X86/mmx-builtins.c   | 48 +++++++++++++++
 clang/test/CodeGen/X86/ssse3-builtins.c | 49 +++++++++++++++
 8 files changed, 250 insertions(+), 77 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 384faa35d246f..f8fb808f7f29c 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -854,10 +854,9 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hadd_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
@@ -886,7 +885,7 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_hadd_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
@@ -921,10 +920,9 @@ _mm256_hadd_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadds_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hadds_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -957,10 +955,9 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_hsub_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
@@ -989,7 +986,7 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_hsub_epi32(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
@@ -1025,7 +1022,7 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
 {
     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 26096da949447..976710a64e80e 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -703,7 +703,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b)
 ///    elements of a vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
 ///    both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_hadd_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
@@ -726,9 +726,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b)
 ///    index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
 ///    both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_hadd_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR 
+_mm256_hadd_ps(__m256 __a, __m256 __b) {
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
@@ -749,7 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b)
 ///    odd-indexed elements of a vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the horizontal
 ///    differences of both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_hsub_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
@@ -772,7 +771,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b)
 ///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal
 ///    differences of both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_hsub_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index cd605df7fb52d..400b28bb877a1 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -89,9 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
 ///    destination.
 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_hadd_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR 
+_mm_hadd_ps(__m128 __a, __m128 __b) {
   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
 }
 
@@ -174,9 +173,8 @@ _mm_moveldup_ps(__m128 __a)
 ///    A 128-bit vector of [2 x double] containing the right source operand.
 /// \returns A 128-bit vector of [2 x double] containing the alternating sums
 ///    and differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_addsub_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_addsub_pd(__m128d __a, __m128d __b) {
   return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
 }
 
@@ -197,9 +195,8 @@ _mm_addsub_pd(__m128d __a, __m128d __b)
 ///    destination.
 /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hadd_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_pd(__m128d __a, __m128d __b) {
   return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
 }
 
@@ -220,9 +217,8 @@ _mm_hadd_pd(__m128d __a, __m128d __b)
 ///    the destination.
 /// \returns A 128-bit vector of [2 x double] containing the horizontal
 ///    differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hsub_pd(__m128d __a, __m128d __b)
-{
+static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_pd(__m128d __a, __m128d __b) {
   return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
 }
 
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index f01c61afa8ea2..d79f7f6ea4091 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -204,10 +204,10 @@ _mm_abs_epi32(__m128i __a) {
 ///    destination.
 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phaddw128(
+      (__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -227,10 +227,9 @@ _mm_hadd_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi32(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadd_epi32(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -250,11 +249,10 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of 
both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_hadd_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phaddw128(
-        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+ _mm_hadd_pi16(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_phaddw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -274,7 +272,7 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of 
both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phaddd128(
@@ -301,10 +299,9 @@ _mm_hadd_pi32(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadds_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hadds_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -327,7 +324,7 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phaddsw128(
@@ -351,10 +348,9 @@ _mm_hadds_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 128-bit vector of [8 x i16] containing the horizontal 
differences
 ///    of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -374,10 +370,9 @@ _mm_hsub_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 128-bit vector of [4 x i32] containing the horizontal 
differences
 ///    of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi32(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsub_epi32(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -397,7 +392,7 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phsubw128(
@@ -421,7 +416,7 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phsubd128(
@@ -448,10 +443,9 @@ _mm_hsub_pi32(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_hsubs_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -474,7 +468,7 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
     return __trunc64(__builtin_ia32_phsubsw128(
@@ -509,10 +503,9 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddubs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_maddubs_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -539,11 +532,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
-                                                 (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_maddubs_pi16(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+                                               (__v16qi)__anyext128(__b)));
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -560,7 +552,7 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b)
 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
diff --git a/clang/test/CodeGen/X86/avx-builtins.c 
b/clang/test/CodeGen/X86/avx-builtins.c
index 4a048744faa61..f381faebededf 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1083,24 +1083,53 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) {
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> 
%{{.*}}, <4 x double> %{{.*}})
   return _mm256_hadd_pd(A, B);
 }
+constexpr bool test_mm256_hadd_epi32_constexpr() {
+    constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+    constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
+    constexpr __m256d result = _mm256_hadd_pd(a, b);
+    return match_m256d(result,1.0+2.0,3.0+4.0,5.0+6.0,7.0+8.0);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
 
 __m256 test_mm256_hadd_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hadd_ps
   // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> 
%{{.*}}, <8 x float> %{{.*}})
   return _mm256_hadd_ps(A, B);
 }
+constexpr bool test_mm256_hadd_ps_constexpr() {
+    constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 
7.0f, 8.0f);
+    constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 
14.0f, 15.0f, 16.0f);
+    constexpr __m256 result = _mm256_hadd_ps(a, b);
+    return match_m256(result,1.0f+2.0f,3.0f+4.0f,5.0f+6.0f,7.0f+8.0f,
+                             9.0f+10.0f,11.0f+12.0f,13.0f+14.0f,15.0f+16.0f);
+}
+TEST_CONSTEXPR(test_mm256_hadd_ps_constexpr())
 
 __m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_hsub_pd
   // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> 
%{{.*}}, <4 x double> %{{.*}})
   return _mm256_hsub_pd(A, B);
 }
+constexpr bool test_mm256_hsub_pd_constexpr() {
+    constexpr __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+    constexpr __m256d b = _mm256_set_pd(5.0, 6.0, 7.0, 8.0);
+    constexpr __m256d result = _mm256_hsub_pd(a, b);
+    return match_m256d(result,1.0-2.0,3.0-4.0,5.0-6.0,7.0-8.0);
+}
+TEST_CONSTEXPR(test_mm256_hsub_pd_constexpr())
 
 __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_hsub_ps
   // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> 
%{{.*}}, <8 x float> %{{.*}})
   return _mm256_hsub_ps(A, B);
 }
+constexpr bool test_mm256_hsub_ps_constexpr() {
+    constexpr __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 
7.0f, 8.0f);
+    constexpr __m256 b = _mm256_set_ps(9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 
14.0f, 15.0f, 16.0f);
+    constexpr __m256 result = _mm256_hsub_ps(a, b);
+    return match_m256(result,1.0f-2.0f,3.0f-4.0f,5.0f-6.0f,7.0f-8.0f,
+                             9.0f-10.0f,11.0f-12.0f,13.0f-14.0f,15.0f-16.0f);
+}
 
 __m256i test_mm256_insert_epi8(__m256i x, char b) {
   // CHECK-LABEL: test_mm256_insert_epi8
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c 
b/clang/test/CodeGen/X86/avx2-builtins.c
index a39ce513837ea..02845b9417a1f 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -462,17 +462,48 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
   return _mm256_hadd_epi16(a, b);
 }
 
+constexpr bool test_mm256_hadd_epi16_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 
+        8,9,10,11,12,13,14,15,16);
+    constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
+        24,25,26,27,28,29,30,31,32);
+    
+    constexpr __m256i result = _mm256_hadd_epi16(a, b);
+    return 
match_v16si(result,1+2,3+4,5+6,7+8,9+10,11+12,13+14,15+16,17+18,19+20,21+22,23+24,25+26,27+28,29+30,31+32);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi16_constexpr())
+
 __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadd_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> 
%{{.*}})
   return _mm256_hadd_epi32(a, b);
 }
 
+constexpr bool test_mm256_hadd_epi32_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+    constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
+    
+    constexpr __m256i result = _mm256_hadd_epi32(a, b);
+    return match_v8si(result,10+20,30+40,50+60,70+80,5+15,25+35, 45+55,65+75);
+}
+TEST_CONSTEXPR(test_mm256_hadd_epi32_constexpr())
+
 __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hadds_epi16
   // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x 
i16> %{{.*}})
   return _mm256_hadds_epi16(a, b);
 }
+constexpr bool test_mm256_hadds_epi16_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 
32767, 32767, 32767,
+        32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+    constexpr __m256i b = _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 
+        1, 1, 1, 1, 1, 1, 1);
+    constexpr __m256i result = _mm256_hadds_epi16(a, b);
+
+    return match_v16si(result, 32767, 32767, 32767, 32767, 32767, 32767, 
32767, 32767,
+    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm256_hadds_epi16_constexpr())
 
 __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi16
@@ -480,18 +511,50 @@ __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
   return _mm256_hsub_epi16(a, b);
 }
 
+constexpr bool test_mm256_hsub_epi16_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 
+        8,9,10,11,12,13,14,15,16);
+    constexpr __m256i b = _mm256_setr_epi16(17,18,19,20,21,22,23,
+        24,25,26,27,28,29,30,31,32);
+    
+    constexpr __m256i result = _mm256_hsub_epi16(a, b);
+    return 
match_v16si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16,17-18,19-20,21-22,23-24,25-26,27-28,29-30,31-32);
+}
+TEST_CONSTEXPR(test_mm256_hsub_epi16_constexpr())
+
 __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsub_epi32
   // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> 
%{{.*}})
   return _mm256_hsub_epi32(a, b);
 }
 
+constexpr bool test_mm256_hsub_epi32_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
+    constexpr __m256i b = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
+    
+    constexpr __m256i result = _mm256_hsub_epi32(a, b);
+    return match_v8si(result,10-20,30-40,50-60,70-80,5-15,25-35, 45-55,65-75);
+}
+TEST_CONSTEXPR(test_mm256_hsub_epi32_constexpr())
+
 __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_hsubs_epi16
   // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x 
i16> %{{.*}})
   return _mm256_hsubs_epi16(a, b);
 }
 
+constexpr bool test_mm256_hsubs_epi16_constexpr() {
+    constexpr __m256i a = _mm256_setr_epi16(32767, 32767, 32767, 32767, 32767, 
32767, 32767, 32767,
+        32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+    constexpr __m256i b = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1);
+    constexpr __m256i result3 = _mm256_hsubs_epi16(a, b);
+
+    return match_v16si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 
32767, 32767,
+    32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm256_hsubs_epi16_constexpr())
+
+
 __m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
   // CHECK-LABEL: test_mm_i32gather_epi32
   // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, ptr 
%{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c 
b/clang/test/CodeGen/X86/mmx-builtins.c
index 7bd2475399bf9..8da0e8c814879 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -309,36 +309,84 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
   return _mm_hadd_pi16(a, b);
 }
+constexpr bool test_mm_hadd_pi16_constexpr() {
+    constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
+    constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
+    
+    constexpr __m64 result = _mm_hadd_pi16(a, b);
+    return match_v4si(result,1+2,3+4,5+6,7+8);
+}
+TEST_CONSTEXPR(test_mm_hadd_pi16_constexpr())
 
 __m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadd_pi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
   return _mm_hadd_pi32(a, b);
 }
+constexpr bool test_mm_hadd_pi32_constexpr() {
+    constexpr __m64 a = _mm_setr_pi32(1, 2);
+    constexpr __m64 b = _mm_setr_pi32(3, 4);
+    
+    constexpr __m64 result = _mm_hadd_pi32(a, b);
+    return match_v2si(result,1+2,3+4);
+}
+TEST_CONSTEXPR(test_mm_hadd_pi32_constexpr())
 
 __m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hadds_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
   return _mm_hadds_pi16(a, b);
 }
+constexpr bool test_mm_hadds_pi16_constexpr() {
+    constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
+    constexpr __m64 b = _mm_setr_pi16(1,1,1,1);
+    
+    constexpr __m64 result = _mm_hadds_pi16(a, b);
+    return match_v4si(result,32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hadds_pi16_constexpr())
 
 __m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
   return _mm_hsub_pi16(a, b);
 }
+constexpr bool test_mm_hsub_pi16_constexpr() {
+    constexpr __m64 a = _mm_setr_pi16(1, 2, 3, 4);
+    constexpr __m64 b = _mm_setr_pi16(5,6,7,8);
+    
+    constexpr __m64 result = _mm_hsub_pi16(a, b);
+    return match_v4si(result,1-2,3-4,5-6,7-8);
+}
+TEST_CONSTEXPR(test_mm_hsub_pi16_constexpr())
 
 __m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsub_pi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
   return _mm_hsub_pi32(a, b);
 }
+constexpr bool test_mm_hsub_pi32_constexpr() {
+    constexpr __m64 a = _mm_setr_pi32(1, 2);
+    constexpr __m64 b = _mm_setr_pi32(3, 4);
+    
+    constexpr __m64 result = _mm_hsub_pi32(a, b);
+    return match_v2si(result,1-2,3-4);
+}
+TEST_CONSTEXPR(test_mm_hsub_pi32_constexpr())
 
 __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_hsubs_pi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
   return _mm_hsubs_pi16(a, b);
 }
+constexpr bool test_mm_hsubs_pi16_constexpr() {
+    constexpr __m64 a = _mm_setr_pi16(32767, 32767, 32767, 32767);
+    constexpr __m64 b = _mm_setr_pi16(-1,-1,-1,-1);
+    
+    constexpr __m64 result = _mm_hsubs_pi16(a, b);
+    return match_v4si(result,32767, 32767, 32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hsubs_pi16_constexpr())
 
 __m64 test_mm_insert_pi16(__m64 a, int d) {
   // CHECK-LABEL: test_mm_insert_pi16
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c 
b/clang/test/CodeGen/X86/ssse3-builtins.c
index 56ff73f08ab32..bd0ef43278217 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -60,36 +60,85 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x 
i16> %{{.*}})
   return _mm_hadd_epi16(a, b);
 }
+constexpr bool test_mm_hadd_epi16_constexpr() {
+    constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+    constexpr __m128i b = _mm_setr_epi16(17,18,19,20,21,22,23,24);
+    
+    constexpr __m128i result = _mm_hadd_epi16(a, b);
+    return match_v8si(result,1+2,3+4,5+6,7+8,17+18,19+20,21+22,23+24);
+}
+TEST_CONSTEXPR(test_mm_hadd_epi16_constexpr())
 
 __m128i test_mm_hadd_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadd_epi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x 
i32> %{{.*}})
   return _mm_hadd_epi32(a, b);
 }
+constexpr bool test_mm_hadd_epi32_constexpr() {
+    constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+    constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
+    
+    constexpr __m128i result = _mm_hadd_epi32(a, b);
+    return match_v4si(result,1+2,3+4,5+6,7+8);
+}
+TEST_CONSTEXPR(test_mm_hadd_epi32_constexpr())
 
 __m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadds_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 
x i16> %{{.*}})
   return _mm_hadds_epi16(a, b);
 }
+constexpr bool test_mm_hadds_epi16_constexpr() {
+    constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 
32767, 32767, 32767);
+    constexpr __m128i b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1);
+    constexpr __m128i result = _mm_hadds_epi16(a, b);
+
+    return match_v8si(result, 32767, 32767, 32767, 32767, 32767, 32767, 32767, 
32767);
+}
+TEST_CONSTEXPR(test_mm_hadds_epi16_constexpr())
+
 
 __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x 
i16> %{{.*}})
   return _mm_hsub_epi16(a, b);
 }
+constexpr bool test_mm_hsub_epi16_constexpr() {
+    constexpr __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+    constexpr __m128i b = _mm_setr_epi16(9,10,11,12,13,14,15,16);
+    
+    constexpr __m128i result = _mm_hsub_epi16(a, b);
+    return match_v8si(result,1-2,3-4,5-6,7-8,9-10,11-12,13-14,15-16);
+}
+TEST_CONSTEXPR(test_mm_hsub_epi16_constexpr())
 
 __m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi32
   // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x 
i32> %{{.*}})
   return _mm_hsub_epi32(a, b);
 }
+constexpr bool test_mm_hsub_epi32_constexpr() {
+    constexpr __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+    constexpr __m128i b = _mm_setr_epi32(5,6,7,8);
+    
+    constexpr __m128i result = _mm_hsub_epi32(a, b);
+    return match_v4si(result,1-2,3-4,5-6,7-8);
+}
+TEST_CONSTEXPR(test_mm_hsub_epi32_constexpr())
 
 __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsubs_epi16
   // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 
x i16> %{{.*}})
   return _mm_hsubs_epi16(a, b);
 }
+constexpr bool test_mm_hsubs_epi16_constexpr() {
+    constexpr __m128i a = _mm_setr_epi16(32767, 32767, 32767, 32767, 32767, 
32767, 32767, 32767);
+    constexpr __m128i b = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
+    constexpr __m128i result3 = _mm_hsubs_epi16(a, b);
+
+    return match_v8si(result3, 32767, 32767, 32767, 32767, 32767, 32767, 
32767, 32767);
+}
+TEST_CONSTEXPR(test_mm_hsubs_epi16_constexpr())
 
 __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_maddubs_epi16

>From 2fadf3fd261935e25adff5b26ad8ee0734746a26 Mon Sep 17 00:00:00 2001
From: whyuuwang <[email protected]>
Date: Thu, 4 Sep 2025 15:55:44 +0800
Subject: [PATCH 2/2] deal issues 15595 [Clang]
 VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX
 PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395

---
 clang/lib/Headers/avx2intrin.h | 15 ++++-----
 clang/lib/Headers/avxintrin.h  | 15 ++++-----
 clang/lib/Headers/pmmintrin.h  |  4 +--
 clang/lib/Headers/tmmintrin.h  | 57 +++++++++++++++-------------------
 4 files changed, 39 insertions(+), 52 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index f8fb808f7f29c..c39f94c7fc16b 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -886,9 +886,8 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the sums.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hadd_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+_mm256_hadd_epi32(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
@@ -987,9 +986,8 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the differences.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hsub_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+_mm256_hsub_epi32(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -1023,9 +1021,8 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_hsubs_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 976710a64e80e..48d79063f9b61 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -704,8 +704,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b)
 /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
 ///    both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hadd_pd(__m256d __a, __m256d __b)
-{
+_mm256_hadd_pd(__m256d __a, __m256d __b) {
   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
 }
 
@@ -726,8 +725,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b)
 ///    index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
 ///    both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR 
-_mm256_hadd_ps(__m256 __a, __m256 __b) {
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a,
+                                                                   __m256 __b) 
{
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
@@ -749,8 +748,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) {
 /// \returns A 256-bit vector of [4 x double] containing the horizontal
 ///    differences of both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hsub_pd(__m256d __a, __m256d __b)
-{
+_mm256_hsub_pd(__m256d __a, __m256d __b) {
   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
 }
 
@@ -771,9 +769,8 @@ _mm256_hsub_pd(__m256d __a, __m256d __b)
 ///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the horizontal
 ///    differences of both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm256_hsub_ps(__m256 __a, __m256 __b)
-{
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a,
+                                                                   __m256 __b) 
{
   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
 }
 
diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h
index 400b28bb877a1..67f2a7ffd1f56 100644
--- a/clang/lib/Headers/pmmintrin.h
+++ b/clang/lib/Headers/pmmintrin.h
@@ -89,8 +89,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
 ///    destination.
 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
 ///    both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR 
-_mm_hadd_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_ps(__m128 __a,
+                                                                  __m128 __b) {
   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
 }
 
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index d79f7f6ea4091..b408c6a3404ec 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -206,8 +206,7 @@ _mm_abs_epi32(__m128i __a) {
 ///    both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_hadd_epi16(__m128i __a, __m128i __b) {
-  return (__m128i)__builtin_ia32_phaddw128(
-      (__v8hi)__a, (__v8hi)__b);
+  return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -249,8 +248,8 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) {
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of 
both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
- _mm_hadd_pi16(__m64 __a, __m64 __b) {
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi16(__m64 __a,
+                                                                   __m64 __b) {
   return __trunc64(__builtin_ia32_phaddw128(
       (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
@@ -272,11 +271,10 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of 
both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hadd_pi32(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phaddd128(
-        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi32(__m64 __a,
+                                                                   __m64 __b) {
+  return __trunc64(__builtin_ia32_phaddd128(
+      (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally adds, with saturation, the adjacent pairs of values contained
@@ -324,11 +322,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) {
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hadds_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phaddsw128(
-        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadds_pi16(__m64 __a,
+                                                                    __m64 __b) 
{
+  return __trunc64(__builtin_ia32_phaddsw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -392,11 +389,10 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) {
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsub_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phsubw128(
-        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi16(__m64 __a,
+                                                                   __m64 __b) {
+  return __trunc64(__builtin_ia32_phsubw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -416,11 +412,10 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsub_pi32(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phsubd128(
-        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi32(__m64 __a,
+                                                                   __m64 __b) {
+  return __trunc64(__builtin_ia32_phsubd128(
+      (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
 }
 
 /// Horizontally subtracts, with saturation, the adjacent pairs of values
@@ -468,11 +463,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) {
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_hsubs_pi16(__m64 __a, __m64 __b)
-{
-    return __trunc64(__builtin_ia32_phsubsw128(
-        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a,
+                                                                    __m64 __b) 
{
+  return __trunc64(__builtin_ia32_phsubsw128(
+      (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
 }
 
 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -553,9 +547,8 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) {
 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 ///    products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_mulhrs_epi16(__m128i __a, __m128i __b)
-{
-    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+_mm_mulhrs_epi16(__m128i __a, __m128i __b) {
+  return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr #155395 (PR #156822)

Reply via email to