https://github.com/markbhasawut updated https://github.com/llvm/llvm-project/pull/161563
>From 397aa430d10bf97fc3cf589f804066ff46b9841b Mon Sep 17 00:00:00 2001 From: Bhasawut Singhaphan <[email protected]> Date: Mon, 22 Sep 2025 17:09:08 +0700 Subject: [PATCH 1/4] [Headers][X86] Enable constexpr handling for MMX/SSE/AVX/AVX512 PMADDWD/PMADDUBSW intrinsics --- clang/include/clang/Basic/BuiltinsX86.td | 19 +++++++++++-------- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 16 ++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 14 ++++++++++++++ 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index a0181b7ae8f9d..5ec22c1fbb6fc 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -123,13 +123,16 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; } - def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">; def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; } + + let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">; + } } // AVX @@ -278,13 +281,14 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; - def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">; def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; } let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">; + def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">; def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">; @@ -581,8 +585,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">; - def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">; def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; @@ -619,6 +621,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">; + def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">; + def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">; + def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; @@ -1378,10 +1383,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512> def subps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">; - def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">; -} let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def addss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">; @@ -1999,6 +2000,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512 } let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { + def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">; + def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">; def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">; def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index a3c4ba5447250..3feba20eb9f04 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2579,6 +2579,12 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_pmadd(InterpState &S, CodePtr OpPC, + const CallExpr *Call, + unsigned BuiltinID) { + return true; // TODO: Implement the builtin. +} + static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned BuiltinID) { @@ -3448,6 +3454,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return interp__builtin_elementwise_int_binop(S, OpPC, Call, llvm::APIntOps::avgCeilU); + case clang::X86::BI__builtin_ia32_pmaddubsw128: + case clang::X86::BI__builtin_ia32_pmaddubsw256: + case clang::X86::BI__builtin_ia32_pmaddubsw512: + return true; // TODO: Use interp__builtin_i32_pmadd. + + case clang::X86::BI__builtin_ia32_pmaddwd128: + case clang::X86::BI__builtin_ia32_pmaddwd256: + case clang::X86::BI__builtin_ia32_pmaddwd512: + return true; // TODO: Use interp__builtin_i32_pmadd. + case clang::X86::BI__builtin_ia32_pmulhuw128: case clang::X86::BI__builtin_ia32_pmulhuw256: case clang::X86::BI__builtin_ia32_pmulhuw512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 7bf28d988f405..d71a8f7a383bc 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11778,6 +11778,20 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case clang::X86::BI__builtin_ia32_pavgw512: return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU); + case clang::X86::BI__builtin_ia32_pmaddubsw128: + case clang::X86::BI__builtin_ia32_pmaddwd128: + case clang::X86::BI__builtin_ia32_pmaddubsw256: + case clang::X86::BI__builtin_ia32_pmaddwd256: + case clang::X86::BI__builtin_ia32_pmaddubsw512: + case clang::X86::BI__builtin_ia32_pmaddwd512: + return true; // TODO: Handle __builtin_ia32_pmaddub + + case clang::X86::BI__builtin_ia32_pmaddwd128: + case clang::X86::BI__builtin_ia32_pmaddwd256: + case clang::X86::BI__builtin_ia32_pmaddwd512: + return true; // TODO: Handle __builtin_ia32_pmadd + }); + case clang::X86::BI__builtin_ia32_pmulhuw128: case clang::X86::BI__builtin_ia32_pmulhuw256: case clang::X86::BI__builtin_ia32_pmulhuw512: >From f6ca5cc38f074f4b6d4437118cab95f5ec6005d9 Mon Sep 17 00:00:00 2001 From: Bhasawut Singhaphan <[email protected]> Date: Wed, 24 Sep 2025 19:38:57 +0000 Subject: [PATCH 2/4] Modified InterpBuiltin.cpp and ExprConstant.cpp --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 45 ++++++++++++++++++++---- clang/lib/AST/ExprConstant.cpp | 44 ++++++++++++++++++----- 2 files changed, 75 insertions(+), 14 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 3feba20eb9f04..7cef140db9e2a 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2579,10 +2579,35 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC, return true; } -static bool interp__builtin_ia32_pmadd(InterpState &S, CodePtr OpPC, - const CallExpr *Call, - unsigned BuiltinID) { - return true; // TODO: Implement the builtin. +static bool interp__builtin_ia32_pmadd( + InterpState &S, CodePtr OpPC, const CallExpr *Call, + llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) { + assert(Call->getArg(0)->getType()->isVectorType() && + Call->getArg(1)->getType()->isVectorType()); + assert(VT->getElementType() == + Call->getArg(1)->getType()->castAs<VectorType>()->getElementType()); + assert(VT->getNumElements() == + Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements()); + assert(VT->getElementType()->isIntegralOrEnumerationType()); + const Pointer &RHS = S.Stk.pop<Pointer>(); + const Pointer &LHS = S.Stk.pop<Pointer>(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>(); + PrimType ElemT = *S.getContext().classify(VT->getElementType()); + unsigned NumElems = VT->getNumElements(); + bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType(); + + for (unsigned I = 0; I != NumElems; ++I) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + APSInt Elem1 = LHS.elem<T>(I).toAPSInt(); + APSInt Elem2 = RHS.elem<T>(I).toAPSInt(); + Dst.elem<T>(I) = static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned)); + }); + } + + Dst.initializeAllElements(); + return true; } static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC, @@ -3457,12 +3482,20 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_pmaddubsw128: case clang::X86::BI__builtin_ia32_pmaddubsw256: case clang::X86::BI__builtin_ia32_pmaddubsw512: - return true; // TODO: Use interp__builtin_i32_pmadd. + return interp__builtin_ia32_pmadd(S, OpPC, Call, + [](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &HiRHS, const APSInt &HiRHS) { + unsigned BitWidth = 2 * LHS.getBitWidth(); + return (LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth)).sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth))); + }); case clang::X86::BI__builtin_ia32_pmaddwd128: case clang::X86::BI__builtin_ia32_pmaddwd256: case clang::X86::BI__builtin_ia32_pmaddwd512: - return true; // TODO: Use interp__builtin_i32_pmadd. + return interp__builtin_ia32_pmadd(S, OpPC, Call, + [](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &HiRHS, const APSInt &HiRHS) { + unsigned BitWidth = 2 * LHS.getBitWidth(); + return (LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) + (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth)); + }); case clang::X86::BI__builtin_ia32_pmulhuw128: case clang::X86::BI__builtin_ia32_pmulhuw256: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index d71a8f7a383bc..f2a01dbc9b93f 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11779,18 +11779,46 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU); case clang::X86::BI__builtin_ia32_pmaddubsw128: - case clang::X86::BI__builtin_ia32_pmaddwd128: case clang::X86::BI__builtin_ia32_pmaddubsw256: - case clang::X86::BI__builtin_ia32_pmaddwd256: case clang::X86::BI__builtin_ia32_pmaddubsw512: - case clang::X86::BI__builtin_ia32_pmaddwd512: - return true; // TODO: Handle __builtin_ia32_pmaddub - case clang::X86::BI__builtin_ia32_pmaddwd128: case clang::X86::BI__builtin_ia32_pmaddwd256: - case clang::X86::BI__builtin_ia32_pmaddwd512: - return true; // TODO: Handle __builtin_ia32_pmadd - }); + case clang::X86::BI__builtin_ia32_pmaddwd512: { + APValue SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + + unsigned SourceLen = SourceLHS.getVectorLength(); + bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType(); + SmallVector<APValue, 4> ResultElements; + ResultElements.reserve(SourceLen / 2); + + for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) { + const APSInt &LoLHS = SourceLHS.getVectorElt(EltNum).getInt(); + const APSInt &LoRHS = SourceRHS.getVectorElt(EltNum).getInt(); + const APSInt &HiLHS = SourceLHS.getVectorElt(EltNum).getInt(); + const APSInt &HiRHS = SourceRHS.getVectorElt(EltNum).getInt(); + unsigned BitWidth = 2 * LHS.getBitWidth(); + + switch (E->getBuiltinCallee()) { + case clang::X86::BI__builtin_ia32_pmaddubsw128: + case clang::X86::BI__builtin_ia32_pmaddubsw256: + case clang::X86::BI__builtin_ia32_pmaddubsw512: + ResultElements.push_back( + APValue(APSInt((LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth)).sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth)))), DestUnsigned)); + break; + case clang::X86::BI__builtin_ia32_pmaddwd128: + case clang::X86::BI__builtin_ia32_pmaddwd256: + case clang::X86::BI__builtin_ia32_pmaddwd512: + ResultElements.push_back( + APValue(APSInt((LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) + (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth))), DestUnsigned)); + break; + } + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } case clang::X86::BI__builtin_ia32_pmulhuw128: case clang::X86::BI__builtin_ia32_pmulhuw256: >From 288d076339ac18a900011faccc6e0ea4f9a68890 Mon Sep 17 00:00:00 2001 From: Bhasawut Singhaphan <[email protected]> Date: Wed, 1 Oct 2025 15:16:25 +0000 Subject: [PATCH 3/4] Update MMX/SSE/AVX/AVX512 PMADDWD/PMADDUBSW intrinsics to be used in constexpr --- clang/lib/Headers/avx2intrin.h | 4 ++-- clang/lib/Headers/avx512bwintrin.h | 12 ++++++------ clang/lib/Headers/avx512vlbwintrin.h | 16 ++++++++-------- clang/lib/Headers/emmintrin.h | 2 +- clang/lib/Headers/mmintrin.h | 6 +++--- clang/lib/Headers/tmmintrin.h | 11 +++++++---- 6 files changed, 27 insertions(+), 24 deletions(-) diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 31759c5386d9f..c6bff41973ef8 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -1035,7 +1035,7 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maddubs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); @@ -1067,7 +1067,7 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_madd_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index c36bd814725fa..473fe94af65d8 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -1064,12 +1064,12 @@ _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maddubs_epi16(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, @@ -1077,26 +1077,26 @@ _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, (__v32hi)_mm512_maddubs_epi16(__X, __Y), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_madd_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_madd_epi16(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_madd_epi16(__A, __B), diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 5e6daa8f7b260..81e4cbb9615c1 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -1295,21 +1295,21 @@ _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_maddubs_epi16(__X, __Y), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_maddubs_epi16(__X, __Y), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1317,35 +1317,35 @@ _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_maddubs_epi16(__X, __Y), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_madd_epi16(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_madd_epi16(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_madd_epi16(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_madd_epi16(__A, __B), diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 6597e7e7d4030..2b1fd7e3a8b07 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2290,7 +2290,7 @@ _mm_avg_epu16(__m128i __a, __m128i __b) { /// A 128-bit signed [8 x i16] vector. /// \returns A 128-bit signed [4 x i32] vector containing the sums of products /// of both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_madd_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); } diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 5f617530b6f78..145980410e2ec 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -679,11 +679,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of /// products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_madd_pi16(__m64 __m1, __m64 __m2) { - return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1), - (__v8hi)__anyext128(__m2))); + return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1), + (__v8hi)__zext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index d40f0c56b2c5a..8d6eddbfcb731 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -23,6 +23,9 @@ #define __trunc64(x) \ (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __zext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, 2, 3) #define __anyext128(x) \ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, -1, -1) @@ -504,7 +507,7 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b) /// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n /// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n /// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15) -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_maddubs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); @@ -534,11 +537,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_maddubs_pi16(__m64 __a, __m64 __b) { - return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), - (__v16qi)__anyext128(__b))); + return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a), + (__v16qi)__zext128(__b))); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit >From 850c53690d08514abd2b548a14482ed758d0ab3b Mon Sep 17 00:00:00 2001 From: Bhasawut Singhaphan <[email protected]> Date: Thu, 2 Oct 2025 01:38:20 +0700 Subject: [PATCH 4/4] Address clang-format issue --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +- clang/lib/Headers/avx2intrin.h | 8 +++----- clang/lib/Headers/emmintrin.h | 4 ++-- clang/lib/Headers/mmintrin.h | 7 +++---- clang/lib/Headers/tmmintrin.h | 12 +++++------- 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 7cef140db9e2a..a2a8a8d7b8554 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2592,7 +2592,7 @@ static bool interp__builtin_ia32_pmadd( const Pointer &RHS = S.Stk.pop<Pointer>(); const Pointer &LHS = S.Stk.pop<Pointer>(); const Pointer &Dst = S.Stk.peek<Pointer>(); - + const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>(); PrimType ElemT = *S.getContext().classify(VT->getElementType()); unsigned NumElems = VT->getNumElements(); diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index c6bff41973ef8..4aaca2db8236a 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -1036,9 +1036,8 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b) /// A 256-bit vector containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maddubs_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); +_mm256_maddubs_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); } /// Multiplies corresponding 16-bit elements of two 256-bit vectors of @@ -1068,8 +1067,7 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_madd_epi16(__m256i __a, __m256i __b) -{ +_mm256_madd_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); } diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 2b1fd7e3a8b07..454e9a2504949 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2290,8 +2290,8 @@ _mm_avg_epu16(__m128i __a, __m128i __b) { /// A 128-bit signed [8 x i16] vector. /// \returns A 128-bit signed [4 x i32] vector containing the sums of products /// of both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_madd_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_madd_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); } diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 145980410e2ec..aca78e6986ad9 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -680,10 +680,9 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) { /// \returns A 64-bit integer vector of [2 x i32] containing the sums of /// products of both parameters. static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_madd_pi16(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1), - (__v8hi)__zext128(__m2))); +_mm_madd_pi16(__m64 __m1, __m64 __m2) { + return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1), + (__v8hi)__zext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 8d6eddbfcb731..49c9d7c0eee92 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -508,9 +508,8 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b) /// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n /// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15) static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_maddubs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); +_mm_maddubs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -538,10 +537,9 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_maddubs_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a), - (__v16qi)__zext128(__b))); +_mm_maddubs_pi16(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a), + (__v16qi)__zext128(__b))); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
