https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/154731
Now that #152455 is done, we can make all the scalar fma intrinsics to wrap __builtin_elementwise_fma, which also allows constexpr The main difference is that FMA4 intrinsics guarantee that the upper elements are zero, while FMA3 passes through the destination register elements like older scalar instructions >From e15a719220dad7f394ef4d51bd530dc247fbda6b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-...@redking.me.uk> Date: Wed, 20 Aug 2025 17:41:15 +0100 Subject: [PATCH] [Headers][X86] Update FMA3/FMA4 scalar intrinsics to use __builtin_elementwise_fma and support constexpr Now that #152455 is done, we can make all the scalar fma intrinsics to wrap __builtin_elementwise_fma, which also allows constexpr The main difference is that FMA4 intrinsics guarantee that the upper elements are zero, while FMA3 passes through the destination register elements like older scalar instructions --- clang/include/clang/Basic/BuiltinsX86.td | 10 -- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 6 -- clang/lib/Headers/fma4intrin.h | 32 +++---- clang/lib/Headers/fmaintrin.h | 40 ++++---- clang/test/CodeGen/X86/fma-builtins.c | 100 ++++++++++--------- clang/test/CodeGen/X86/fma4-builtins.c | 116 ++++++++++++++--------- 6 files changed, 164 insertions(+), 140 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index cc1da937455a2..1dd4a122c9500 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -885,16 +885,6 @@ let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; } -let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">; - def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">; -} - -let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">; - def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">; -} - let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">; def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">; diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index b9248a7d43f85..d8609c3e00f4f 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -1028,16 +1028,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false); - case X86::BI__builtin_ia32_vfmaddss3: - case X86::BI__builtin_ia32_vfmaddsd3: case X86::BI__builtin_ia32_vfmaddsh3_mask: case X86::BI__builtin_ia32_vfmaddss3_mask: case X86::BI__builtin_ia32_vfmaddsd3_mask: return EmitScalarFMAExpr(*this, E, Ops, Ops[0]); - case X86::BI__builtin_ia32_vfmaddss: - case X86::BI__builtin_ia32_vfmaddsd: - return EmitScalarFMAExpr(*this, E, Ops, - Constant::getNullValue(Ops[0]->getType())); case X86::BI__builtin_ia32_vfmaddsh3_maskz: case X86::BI__builtin_ia32_vfmaddss3_maskz: case X86::BI__builtin_ia32_vfmaddsd3_maskz: diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h index e0a0e4c968950..426be2d3c3658 100644 --- a/clang/lib/Headers/fma4intrin.h +++ b/clang/lib/Headers/fma4intrin.h @@ -40,16 +40,16 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) { (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0])); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0])); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR @@ -64,16 +64,16 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) { -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); + return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0])); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); + return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0])); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR @@ -88,16 +88,16 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) { (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0])); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); + return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0])); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR @@ -112,16 +112,16 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) { -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); + return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0])); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); + return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0])); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h index d8ea489022b8f..2aae620e04fb9 100644 --- a/clang/lib/Headers/fmaintrin.h +++ b/clang/lib/Headers/fmaintrin.h @@ -91,10 +91,11 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) /// 32 bits. /// \returns A 128-bit vector of [4 x float] containing the result in the low /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]); + return __A; } /// Computes a scalar multiply-add of the double-precision values in the @@ -120,10 +121,11 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) /// 64 bits. /// \returns A 128-bit vector of [2 x double] containing the result in the low /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); + __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]); + return __A; } /// Computes a multiply-subtract of 128-bit vectors of [4 x float]. @@ -191,10 +193,11 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) /// 32 bits. /// \returns A 128-bit vector of [4 x float] containing the result in the low /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); + __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]); + return __A; } /// Computes a scalar multiply-subtract of the double-precision values in @@ -220,10 +223,11 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) /// 64 bits. /// \returns A 128-bit vector of [2 x double] containing the result in the low /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); + __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]); + return __A; } /// Computes a negated multiply-add of 128-bit vectors of [4 x float]. @@ -291,10 +295,11 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) /// 32 bits. /// \returns A 128-bit vector of [4 x float] containing the result in the low /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); + __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]); + return __A; } /// Computes a scalar negated multiply-add of the double-precision values @@ -320,10 +325,11 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) /// 64 bits. /// \returns A 128-bit vector of [2 x double] containing the result in the low /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); + __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]); + return __A; } /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float]. @@ -391,10 +397,11 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) /// 32 bits. /// \returns A 128-bit vector of [4 x float] containing the result in the low /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); + __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]); + return __A; } /// Computes a scalar negated multiply-subtract of the double-precision @@ -420,10 +427,11 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) /// 64 bits. /// \returns A 128-bit vector of [2 x double] containing the result in the low /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); + __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]); + return __A; } /// Computes a multiply with alternating add/subtract of 128-bit vectors of diff --git a/clang/test/CodeGen/X86/fma-builtins.c b/clang/test/CodeGen/X86/fma-builtins.c index 8e9822ec6ad2f..5601980166697 100644 --- a/clang/test/CodeGen/X86/fma-builtins.c +++ b/clang/test/CodeGen/X86/fma-builtins.c @@ -23,23 +23,25 @@ TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, __m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fmadd_ss - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) - // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_fmadd_ss(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_fmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -7.0f, 1.0f, -2.0f, -0.0f)); __m128d test_mm_fmadd_sd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_fmadd_sd - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) - // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_fmadd_sd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_fmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 1.0)); __m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fmsub_ps @@ -59,25 +61,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, __m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fmsub_ss - // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}} - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) - // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg float %{{.+}} + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[NEG]]) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_fmsub_ss(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_fmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -9.0f, 1.0f, -2.0f, -0.0f)); __m128d test_mm_fmsub_sd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_fmsub_sd - // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}} - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) - // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg double %{{.+}} + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[NEG]]) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_fmsub_sd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_fmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 1.0)); __m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fnmadd_ps @@ -97,25 +101,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0 __m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fnmadd_ss - // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}} - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) - // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg float %{{.+}} + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float %{{.*}}) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_fnmadd_ss(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_fnmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 9.0f, 1.0f, -2.0f, -0.0f)); __m128d test_mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_fnmadd_sd - // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}} - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) - // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg double %{{.+}} + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double %{{.*}}) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_fnmadd_sd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_fnmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 1.0)); __m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fnmsub_ps @@ -137,27 +143,29 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0 __m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fnmsub_ss - // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}} - // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}} - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) - // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg float %{{.+}} + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: [[NEG2:%.+]] = fneg float %{{.+}} + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float [[NEG2]]) + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0 return _mm_fnmsub_ss(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_fnmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 7.0f, 1.0f, -2.0f, -0.0f)); __m128d test_mm_fnmsub_sd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_fnmsub_sd - // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}} - // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}} - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) - // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg double %{{.+}} + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: [[NEG2:%.+]] = fneg double %{{.+}} + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double [[NEG2]]) + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0 return _mm_fnmsub_sd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_fnmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 1.0)); __m128 test_mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_fmaddsub_ps diff --git a/clang/test/CodeGen/X86/fma4-builtins.c b/clang/test/CodeGen/X86/fma4-builtins.c index dcfd48a220e11..401b70726a63b 100644 --- a/clang/test/CodeGen/X86/fma4-builtins.c +++ b/clang/test/CodeGen/X86/fma4-builtins.c @@ -23,23 +23,29 @@ TEST_CONSTEXPR(match_m128d(_mm_macc_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, __m128 test_mm_macc_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_macc_ss - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) - // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0 + // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3 return _mm_macc_ss(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_macc_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -7.0f, 0.0f, 0.0f, 0.0f)); __m128d test_mm_macc_sd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_macc_sd - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}}) - // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0 + // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0 + // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1 return _mm_macc_sd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_macc_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 0.0)); __m128 test_mm_msub_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_msub_ps @@ -59,25 +65,31 @@ TEST_CONSTEXPR(match_m128d(_mm_msub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, __m128 test_mm_msub_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_msub_ss - // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}} - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0 - // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[C]]) - // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg float %{{.+}} + // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[NEG]]) + // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3 return _mm_msub_ss(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_msub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -9.0f, 0.0f, 0.0f, 0.0f)); __m128d test_mm_msub_sd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_msub_sd - // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}} - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0 - // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[C]]) - // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg double %{{.+}} + // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[NEG]]) + // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0 + // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1 return _mm_msub_sd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_msub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 0.0)); __m128 test_mm_nmacc_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_nmacc_ps @@ -97,25 +109,31 @@ TEST_CONSTEXPR(match_m128d(_mm_nmacc_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, __m128 test_mm_nmacc_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_nmacc_ss - // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}} - // CHECK: [[A:%.+]] = extractelement <4 x float> [[NEG]], i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: call float @llvm.fma.f32(float [[A]], float %{{.*}}, float %{{.*}}) - // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg float %{{.+}} + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: call float @llvm.fma.f32(float [[NEG]], float %{{.*}}, float %{{.*}}) + // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3 return _mm_nmacc_ss(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_nmacc_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 9.0f, 0.0f, 0.0f, 0.0f)); __m128d test_mm_nmacc_sd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_nmacc_sd - // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}} - // CHECK: [[A:%.+]] = extractelement <2 x double> [[NEG]], i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: call double @llvm.fma.f64(double [[A]], double %{{.*}}, double %{{.*}}) - // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg double %{{.+}} + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: call double @llvm.fma.f64(double [[NEG]], double %{{.*}}, double %{{.*}}) + // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0 + // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1 return _mm_nmacc_sd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_nmacc_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 0.0)); __m128 test_mm_nmsub_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_nmsub_ps @@ -137,27 +155,33 @@ TEST_CONSTEXPR(match_m128d(_mm_nmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0, __m128 test_mm_nmsub_ss(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_nmsub_ss - // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}} - // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}} - // CHECK: [[A:%.+]] = extractelement <4 x float> [[NEG]], i64 0 - // CHECK: extractelement <4 x float> %{{.*}}, i64 0 - // CHECK: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0 - // CHECK: call float @llvm.fma.f32(float [[A]], float %{{.*}}, float [[C]]) - // CHECK: insertelement <4 x float> zeroinitializer, float %{{.*}}, i64 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg float %{{.+}} + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: [[NEG2:%.+]] = fneg float %{{.+}} + // CHECK: call float @llvm.fma.f32(float [[NEG]], float %{{.*}}, float [[NEG2]]) + // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2 + // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3 return _mm_nmsub_ss(a, b, c); } +TEST_CONSTEXPR(match_m128(_mm_nmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 7.0f, 0.0f, 0.0f, 0.0f)); __m128d test_mm_nmsub_sd(__m128d a, __m128d b, __m128d c) { // CHECK-LABEL: test_mm_nmsub_sd - // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}} - // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}} - // CHECK: [[A:%.+]] = extractelement <2 x double> [[NEG]], i64 0 - // CHECK: extractelement <2 x double> %{{.*}}, i64 0 - // CHECK: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0 - // CHECK: call double @llvm.fma.f64(double [[A]], double %{{.*}}, double [[C]]) - // CHECK: insertelement <2 x double> zeroinitializer, double %{{.*}}, i64 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: [[NEG:%.+]] = fneg double %{{.+}} + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: [[NEG2:%.+]] = fneg double %{{.+}} + // CHECK: call double @llvm.fma.f64(double [[NEG]], double %{{.*}}, double [[NEG2]]) + // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0 + // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1 return _mm_nmsub_sd(a, b, c); } +TEST_CONSTEXPR(match_m128d(_mm_nmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 0.0)); __m128 test_mm_maddsub_ps(__m128 a, __m128 b, __m128 c) { // CHECK-LABEL: test_mm_maddsub_ps _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits