https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/154731

Now that #152455 is done, we can make all the scalar FMA intrinsics wrap
__builtin_elementwise_fma, which also allows them to be used in constant
expressions.

The main difference between the two families is that the FMA4 intrinsics
guarantee that the upper elements of the result are zeroed, while the FMA3
intrinsics pass through the upper elements of the destination register,
matching the behaviour of the older scalar instructions.
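
To make that difference concrete, here is a small sketch (not part of the
patch) contrasting the two scalar forms. It assumes <x86intrin.h> is available
and the file is built with -mfma -mfma4; the function names are illustrative
only:

  #include <x86intrin.h>

  // FMA3: result[0] = a[0]*b[0] + c[0]; upper lanes are copied from a.
  __m128 fma3_scalar(__m128 a, __m128 b, __m128 c) {
    return _mm_fmadd_ss(a, b, c);
  }

  // FMA4: result[0] = a[0]*b[0] + c[0]; upper lanes are zeroed.
  __m128 fma4_scalar(__m128 a, __m128 b, __m128 c) {
    return _mm_macc_ss(a, b, c);
  }

With this change both forms can also be folded in constant expressions, as
exercised by the TEST_CONSTEXPR cases added to the tests below.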

From e15a719220dad7f394ef4d51bd530dc247fbda6b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-...@redking.me.uk>
Date: Wed, 20 Aug 2025 17:41:15 +0100
Subject: [PATCH] [Headers][X86] Update FMA3/FMA4 scalar intrinsics to use
 __builtin_elementwise_fma and support constexpr

Now that #152455 is done, we can make all the scalar FMA intrinsics wrap
__builtin_elementwise_fma, which also allows them to be used in constant
expressions.

The main difference between the two families is that the FMA4 intrinsics
guarantee that the upper elements of the result are zeroed, while the FMA3
intrinsics pass through the upper elements of the destination register,
matching the behaviour of the older scalar instructions.
---
 clang/include/clang/Basic/BuiltinsX86.td |  10 --
 clang/lib/CodeGen/TargetBuiltins/X86.cpp |   6 --
 clang/lib/Headers/fma4intrin.h           |  32 +++----
 clang/lib/Headers/fmaintrin.h            |  40 ++++----
 clang/test/CodeGen/X86/fma-builtins.c    | 100 ++++++++++---------
 clang/test/CodeGen/X86/fma4-builtins.c   | 116 ++++++++++++++---------
 6 files changed, 164 insertions(+), 140 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td 
b/clang/include/clang/Basic/BuiltinsX86.td
index cc1da937455a2..1dd4a122c9500 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -885,16 +885,6 @@ let Features = "sha", Attributes = [NoThrow, Const, 
RequiredVectorWidth<128>] in
   def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, 
int>)">;
 }
 
-let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] 
in {
-  def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, 
float>, _Vector<4, float>)">;
-  def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, 
_Vector<2, double>, _Vector<2, double>)">;
-}
-
-let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] 
in {
-  def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, 
float>, _Vector<4, float>)">;
-  def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, 
double>, _Vector<2, double>)">;
-}
-
 let Features = "fma|fma4", Attributes = [NoThrow, Const, 
RequiredVectorWidth<128>] in {
   def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, 
_Vector<4, float>, _Vector<4, float>)">;
   def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, 
_Vector<2, double>, _Vector<2, double>)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp 
b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index b9248a7d43f85..d8609c3e00f4f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -1028,16 +1028,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned 
BuiltinID,
   case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
 
-  case X86::BI__builtin_ia32_vfmaddss3:
-  case X86::BI__builtin_ia32_vfmaddsd3:
   case X86::BI__builtin_ia32_vfmaddsh3_mask:
   case X86::BI__builtin_ia32_vfmaddss3_mask:
   case X86::BI__builtin_ia32_vfmaddsd3_mask:
     return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
-  case X86::BI__builtin_ia32_vfmaddss:
-  case X86::BI__builtin_ia32_vfmaddsd:
-    return EmitScalarFMAExpr(*this, E, Ops,
-                             Constant::getNullValue(Ops[0]->getType()));
   case X86::BI__builtin_ia32_vfmaddsh3_maskz:
   case X86::BI__builtin_ia32_vfmaddss3_maskz:
   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h
index e0a0e4c968950..426be2d3c3658 100644
--- a/clang/lib/Headers/fma4intrin.h
+++ b/clang/lib/Headers/fma4intrin.h
@@ -40,16 +40,16 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, 
(__v4sf)__C);
+  return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, 
(__v2df)__C);
+  return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -64,16 +64,16 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, 
-(__v4sf)__C);
+  return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, 
-(__v2df)__C);
+  return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -88,16 +88,16 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, 
(__v4sf)__C);
+  return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, 
(__v2df)__C);
+  return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -112,16 +112,16 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, 
-(__v4sf)__C);
+  return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, 
-(__v2df)__C);
+  return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index d8ea489022b8f..2aae620e04fb9 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -91,10 +91,11 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, 
(__v4sf)__C);
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a scalar multiply-add of the double-precision values in the
@@ -120,10 +121,11 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, 
(__v2df)__C);
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
@@ -191,10 +193,11 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 ///   32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, 
-(__v4sf)__C);
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a scalar multiply-subtract of the double-precision values in
@@ -220,10 +223,11 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, 
-(__v2df)__C);
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
@@ -291,10 +295,11 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, 
(__v4sf)__C);
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a scalar negated multiply-add of the double-precision values
@@ -320,10 +325,11 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, 
(__v2df)__C);
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
@@ -391,10 +397,11 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, 
-(__v4sf)__C);
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a scalar negated multiply-subtract of the double-precision
@@ -420,10 +427,11 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, 
-(__v2df)__C);
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
diff --git a/clang/test/CodeGen/X86/fma-builtins.c 
b/clang/test/CodeGen/X86/fma-builtins.c
index 8e9822ec6ad2f..5601980166697 100644
--- a/clang/test/CodeGen/X86/fma-builtins.c
+++ b/clang/test/CodeGen/X86/fma-builtins.c
@@ -23,23 +23,25 @@ TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, 
-4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmadd_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float 
%{{.*}})
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_fmadd_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_fmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, 
(__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 
-7.0f, 1.0f, -2.0f, -0.0f));
 
 __m128d test_mm_fmadd_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmadd_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double 
%{{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_fmadd_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_fmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 
1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 1.0));
 
 __m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmsub_ps
@@ -59,25 +61,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, 
-4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float 
%{{.*}})
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float 
[[NEG]])
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_fmsub_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_fmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, 
(__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 
-9.0f, 1.0f, -2.0f, -0.0f));
 
 __m128d test_mm_fmsub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double 
%{{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double 
[[NEG]])
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_fmsub_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_fmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 
1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 1.0));
 
 __m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmadd_ps
@@ -97,25 +101,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, 
-4.0 }, (__m128d){ -0.0
 
 __m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmadd_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float 
%{{.*}})
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float 
%{{.*}})
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_fnmadd_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_fnmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, 
(__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 
9.0f, 1.0f, -2.0f, -0.0f));
 
 __m128d test_mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fnmadd_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double 
%{{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double 
%{{.*}})
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_fnmadd_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_fnmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 
1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 1.0));
 
 __m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmsub_ps
@@ -137,27 +143,29 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, 
-4.0 }, (__m128d){ -0.0
 
 __m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fnmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float 
%{{.*}})
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG2:%.+]] = fneg float %{{.+}}
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float 
[[NEG2]])
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_fnmsub_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_fnmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, 
(__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 
7.0f, 1.0f, -2.0f, -0.0f));
 
 __m128d test_mm_fnmsub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_fnmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double 
%{{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG2:%.+]] = fneg double %{{.+}}
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double 
[[NEG2]])
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_fnmsub_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_fnmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 
1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 1.0));
 
 __m128 test_mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_fmaddsub_ps
diff --git a/clang/test/CodeGen/X86/fma4-builtins.c 
b/clang/test/CodeGen/X86/fma4-builtins.c
index dcfd48a220e11..401b70726a63b 100644
--- a/clang/test/CodeGen/X86/fma4-builtins.c
+++ b/clang/test/CodeGen/X86/fma4-builtins.c
@@ -23,23 +23,29 @@ TEST_CONSTEXPR(match_m128d(_mm_macc_pd((__m128d){ 0.0, -4.0 
}, (__m128d){ -0.0,
 
 __m128 test_mm_macc_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_macc_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float 
%{{.*}})
-  // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
+  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_macc_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_macc_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, 
(__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 
-7.0f, 0.0f, 0.0f, 0.0f));
 
 __m128d test_mm_macc_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_macc_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double 
%{{.*}})
-  // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
+  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_macc_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_macc_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 
2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 0.0));
 
 __m128 test_mm_msub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_msub_ps
@@ -59,25 +65,31 @@ TEST_CONSTEXPR(match_m128d(_mm_msub_pd((__m128d){ 0.0, -4.0 
}, (__m128d){ -0.0,
 
 __m128 test_mm_msub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_msub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[C]])
-  // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float 
[[NEG]])
+  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_msub_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_msub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, 
(__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 
-9.0f, 0.0f, 0.0f, 0.0f));
 
 __m128d test_mm_msub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_msub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double 
[[C]])
-  // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double 
[[NEG]])
+  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_msub_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_msub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 
2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 0.0));
 
 __m128 test_mm_nmacc_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmacc_ps
@@ -97,25 +109,31 @@ TEST_CONSTEXPR(match_m128d(_mm_nmacc_pd((__m128d){ 0.0, 
-4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_nmacc_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmacc_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: call float @llvm.fma.f32(float [[A]], float %{{.*}}, float %{{.*}})
-  // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: call float @llvm.fma.f32(float [[NEG]], float %{{.*}}, float 
%{{.*}})
+  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_nmacc_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_nmacc_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, 
(__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 
9.0f, 0.0f, 0.0f, 0.0f));
 
 __m128d test_mm_nmacc_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_nmacc_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.fma.f64(double [[A]], double %{{.*}}, double 
%{{.*}})
-  // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: call double @llvm.fma.f64(double [[NEG]], double %{{.*}}, double 
%{{.*}})
+  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_nmacc_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_nmacc_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 
1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 0.0));
 
 __m128 test_mm_nmsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmsub_ps
@@ -137,27 +155,33 @@ TEST_CONSTEXPR(match_m128d(_mm_nmsub_pd((__m128d){ 0.0, 
-4.0 }, (__m128d){ -0.0,
 
 __m128 test_mm_nmsub_ss(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_nmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK: call float @llvm.fma.f32(float [[A]], float %{{.*}}, float [[C]])
-  // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: [[NEG2:%.+]] = fneg float %{{.+}}
+  // CHECK: call float @llvm.fma.f32(float [[NEG]], float %{{.*}}, float 
[[NEG2]])
+  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_nmsub_ss(a, b, c);
 }
+TEST_CONSTEXPR(match_m128(_mm_nmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, 
(__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 
7.0f, 0.0f, 0.0f, 0.0f));
 
 __m128d test_mm_nmsub_sd(__m128d a, __m128d b, __m128d c) {
   // CHECK-LABEL: test_mm_nmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK: call double @llvm.fma.f64(double [[A]], double %{{.*}}, double 
[[C]])
-  // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: [[NEG2:%.+]] = fneg double %{{.+}}
+  // CHECK: call double @llvm.fma.f64(double [[NEG]], double %{{.*}}, double 
[[NEG2]])
+  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_nmsub_sd(a, b, c);
 }
+TEST_CONSTEXPR(match_m128d(_mm_nmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 
1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 0.0));
 
 __m128 test_mm_maddsub_ps(__m128 a, __m128 b, __m128 c) {
   // CHECK-LABEL: test_mm_maddsub_ps
