https://github.com/woruyu updated https://github.com/llvm/llvm-project/pull/156003
>From 751b39b15014e148ac0ea245b4f5163a3d1fad2a Mon Sep 17 00:00:00 2001 From: woruyu <[email protected]> Date: Fri, 19 Sep 2025 00:21:36 -0900 Subject: [PATCH 1/4] [Headers][X86] VectorExprEvaluator::VisitCallExpr - allow SSE/AVX2/AVX512 pack intrinsics to be used in constexpr --- clang/include/clang/Basic/BuiltinsX86.td | 25 ++++---- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 70 ++++++++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 62 ++++++++++++++++++- clang/lib/Headers/avx2intrin.h | 20 +++---- clang/lib/Headers/avx512bwintrin.h | 20 +++---- clang/lib/Headers/emmintrin.h | 12 ++-- clang/lib/Headers/mmintrin.h | 27 ++++----- clang/lib/Headers/smmintrin.h | 4 +- clang/test/CodeGen/X86/avx2-builtins.c | 4 ++ clang/test/CodeGen/X86/avx512bw-builtins.c | 4 ++ clang/test/CodeGen/X86/mmx-builtins.c | 3 + clang/test/CodeGen/X86/sse2-builtins.c | 3 + clang/test/CodeGen/X86/sse41-builtins.c | 1 + 13 files changed, 196 insertions(+), 59 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 044c755d4d7cf..1f0ffff09eaae 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -93,9 +93,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } let Features = "sse2" in { - def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; - def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; - def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">; def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">; def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">; @@ -108,6 +105,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">; def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">; + def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; + def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; + def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; } let Features = "sse3" in { @@ -312,7 +312,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; - def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; @@ -338,6 +337,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVector def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">; def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">; + def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; } let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { @@ -571,10 +571,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; - def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; - def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; - def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; - def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">; def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; @@ -647,6 +643,10 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">; def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">; + def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; + def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; + def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; + def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; } let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { @@ -1308,11 +1308,14 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512> let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">; - def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; + def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; +} + +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">; - def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; + def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">; - def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; + def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; } let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 77729a5d67c87..26cbff74f2b84 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "../ExprConstShared.h" #include "Boolean.h" +#include "ByteCode/FixedPoint.h" #include "EvalEmitter.h" #include "Interp.h" #include "InterpBuiltinBitCast.h" @@ -2604,6 +2605,51 @@ static bool interp__builtin_elementwise_int_binop( return true; } +static bool interp__builtin_x86_pack( + InterpState &S, CodePtr, const CallExpr *E, + llvm::function_ref<APSInt(const APSInt &)> narrowElement) { + const auto *VT0 = E->getArg(0)->getType()->castAs<VectorType>(); + const auto *VT1 = E->getArg(1)->getType()->castAs<VectorType>(); + assert(VT0 && VT1 && "pack builtin VT0 and VT1 must be VectorType"); + assert(VT0->getElementType() == VT1->getElementType() && + VT0->getNumElements() == VT1->getNumElements() && + "pack builtin VT0 and VT1 ElementType must be same"); + + const Pointer &RHS = S.Stk.pop<Pointer>(); + const Pointer &LHS = S.Stk.pop<Pointer>(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + const ASTContext &ASTCtx = S.getASTContext(); + const unsigned SrcBits = ASTCtx.getIntWidth(VT0->getElementType()); + const unsigned LHSVecLen = VT0->getNumElements(); + const unsigned VectorBits = LHSVecLen * SrcBits; + const unsigned SrcPerLane = VectorBits >= 128 ? (128 / SrcBits) : LHSVecLen; + const unsigned Lanes = VectorBits >= 128 ? (VectorBits / 128) : 1; + + PrimType SrcT = *S.getContext().classify(VT0->getElementType()); + PrimType DstT = *S.getContext().classify(getElemType(Dst)); + + for (unsigned Lane = 0; Lane != Lanes; ++Lane) { + const unsigned BaseSrc = Lane * SrcPerLane; + const unsigned BaseDst = Lane * (2 * SrcPerLane); + + for (unsigned I = 0; I != SrcPerLane; ++I) { + INT_TYPE_SWITCH_NO_BOOL(SrcT, { + APSInt A = LHS.elem<T>(BaseSrc + I).toAPSInt(); + APSInt B = RHS.elem<T>(BaseSrc + I).toAPSInt(); + APSInt AO = narrowElement(A); + APSInt BO = narrowElement(B); + + assignInteger(S, Dst.atIndex(BaseDst + I), DstT, AO); + assignInteger(S, Dst.atIndex(BaseDst + SrcPerLane + I), DstT, BO); + }); + } + } + + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned BuiltinID) { @@ -3477,6 +3523,30 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, } return LHS.lshr(RHS.getZExtValue()); }); + case clang::X86::BI__builtin_ia32_packsswb128: + case clang::X86::BI__builtin_ia32_packsswb256: + case clang::X86::BI__builtin_ia32_packsswb512: + case clang::X86::BI__builtin_ia32_packssdw128: + case clang::X86::BI__builtin_ia32_packssdw256: + case clang::X86::BI__builtin_ia32_packssdw512: + return interp__builtin_x86_pack(S, OpPC, Call, [](const APSInt &Src) { + APInt Value = APSInt(Src).truncSSat(Src.getBitWidth() / 2); + return APSInt(Value, /*isUnsigned=*/false); + }); + case clang::X86::BI__builtin_ia32_packusdw128: + case clang::X86::BI__builtin_ia32_packusdw256: + case clang::X86::BI__builtin_ia32_packusdw512: + case clang::X86::BI__builtin_ia32_packuswb128: + case clang::X86::BI__builtin_ia32_packuswb256: + case clang::X86::BI__builtin_ia32_packuswb512: + return interp__builtin_x86_pack(S, OpPC, Call, [](const APSInt &Src) { + unsigned DstBits = Src.getBitWidth() / 2; + if (Src.isNegative()) + return APSInt(APInt::getZero(DstBits), /*isUnsigned=*/true); + if (Src.isIntN(DstBits)) + return APSInt(Src.trunc(DstBits), /*isUnsigned=*/true); + return APSInt(APInt::getAllOnes(DstBits), /*isUnsigned=*/true); + }); case clang::X86::BI__builtin_ia32_vprotbi: case clang::X86::BI__builtin_ia32_vprotdi: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3b9ca82910033..b4a353f059f16 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11575,6 +11575,43 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO, return false; } +static bool +evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, + llvm::function_ref<APSInt(const APSInt &)> narrowElement) { + APValue LHS, RHS; + if (!EvaluateAsRValue(Info, E->getArg(0), LHS) || + !EvaluateAsRValue(Info, E->getArg(1), RHS)) + return false; + + unsigned LHSVecLen = LHS.getVectorLength(); + unsigned RHSVecLen = RHS.getVectorLength(); + + assert(LHSVecLen != 0 && LHSVecLen == RHSVecLen && + "pack builtin LHSVecLen must equal to RHSVecLen"); + + const VectorType *VT0 = E->getArg(0)->getType()->castAs<VectorType>(); + const unsigned SrcBits = Info.Ctx.getIntWidth(VT0->getElementType()); + const unsigned VectorBits = LHSVecLen * SrcBits; + const unsigned srcPerLane = VectorBits >= 128 ? 128 / SrcBits : LHSVecLen; + const unsigned lanes = VectorBits >= 128 ? VectorBits / 128 : 1; + + SmallVector<APValue, 64> Out; + Out.reserve(LHSVecLen + RHSVecLen); + + for (unsigned lane = 0; lane != lanes; ++lane) { + unsigned base = lane * srcPerLane; + for (unsigned i = 0; i != srcPerLane; ++i) + Out.emplace_back( + APValue(narrowElement(LHS.getVectorElt(base + i).getInt()))); + for (unsigned i = 0; i != srcPerLane; ++i) + Out.emplace_back( + APValue(narrowElement(RHS.getVectorElt(base + i).getInt()))); + } + + Result = APValue(Out.data(), Out.size()); + return true; +} + bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!IsConstantEvaluatedBuiltinCall(E)) return ExprEvaluatorBaseTy::VisitCallExpr(E); @@ -11768,7 +11805,30 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { } return LHS.lshr(RHS.getZExtValue()); }); - + case X86::BI__builtin_ia32_packsswb128: + case X86::BI__builtin_ia32_packsswb256: + case X86::BI__builtin_ia32_packsswb512: + case X86::BI__builtin_ia32_packssdw128: + case X86::BI__builtin_ia32_packssdw256: + case X86::BI__builtin_ia32_packssdw512: + return evalPackBuiltin(E, Info, Result, [](const APSInt &Src) { + APInt Value = APSInt(Src).truncSSat(Src.getBitWidth() / 2); + return APSInt(Value, /*isUnsigned=*/false); + }); + case X86::BI__builtin_ia32_packusdw128: + case X86::BI__builtin_ia32_packusdw256: + case X86::BI__builtin_ia32_packusdw512: + case X86::BI__builtin_ia32_packuswb128: + case X86::BI__builtin_ia32_packuswb256: + case X86::BI__builtin_ia32_packuswb512: + return evalPackBuiltin(E, Info, Result, [](const APSInt &Src) { + unsigned DstBits = Src.getBitWidth() / 2; + if (Src.isNegative()) + return APSInt(APInt::getZero(DstBits), /*isUnsigned=*/true); + if (Src.isIntN(DstBits)) + return APSInt(Src.trunc(DstBits), /*isUnsigned=*/true); + return APSInt(APInt::getAllOnes(DstBits), /*isUnsigned=*/true); + }); case clang::X86::BI__builtin_ia32_pmuldq128: case clang::X86::BI__builtin_ia32_pmuldq256: case clang::X86::BI__builtin_ia32_pmuldq512: diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index e35c159fec7fd..a62c31e107a60 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -165,9 +165,8 @@ _mm256_abs_epi32(__m256i __a) { /// A 256-bit vector of [16 x i16] used to generate result[127:64] and /// result[255:192]. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packs_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_packs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); } @@ -197,9 +196,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [8 x i32] used to generate result[127:64] and /// result[255:192]. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packs_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_packs_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); } @@ -228,9 +226,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] used to generate result[127:64] and /// result[255:192]. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packus_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_packus_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); } @@ -260,9 +257,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [8 x i32] used to generate result[127:64] and /// result[255:192]. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packus_epi32(__m256i __V1, __m256i __V2) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_packus_epi32(__m256i __V1, __m256i __V2) { return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 8911c6cc220be..1a566739b50eb 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -516,9 +516,8 @@ _mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packs_epi32(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_packs_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B); } @@ -538,9 +537,8 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packs_epi16(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_packs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B); } @@ -560,9 +558,8 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packus_epi32(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_packus_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B); } @@ -582,9 +579,8 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packus_epi16(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_packus_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B); } diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 12260ec6ea14c..b5277acb33ff9 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -4159,8 +4159,8 @@ void _mm_mfence(void); /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are /// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [16 x i8] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_packs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); } @@ -4182,8 +4182,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values /// are written to the higher 64 bits of the result. /// \returns A 128-bit vector of [8 x i16] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_packs_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); } @@ -4205,8 +4205,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are /// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [16 x i8] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_packus_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); } diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 18e2c2154362a..5f617530b6f78 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -156,11 +156,10 @@ _mm_cvtm64_si64(__m64 __m) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_packs_pi16(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_packsswb128( - (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_packs_pi16(__m64 __m1, __m64 __m2) { + return __trunc64(__builtin_ia32_packsswb128( + (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } /// Converts, with saturation, 32-bit signed integers from both 64-bit integer @@ -182,11 +181,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_packs_pi32(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_packssdw128( - (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_packs_pi32(__m64 __m1, __m64 __m2) { + return __trunc64(__builtin_ia32_packssdw128( + (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){})); } /// Converts, with saturation, 16-bit signed integers from both 64-bit integer @@ -208,11 +206,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_packs_pu16(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_packuswb128( - (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_packs_pu16(__m64 __m1, __m64 __m2) { + return __trunc64(__builtin_ia32_packuswb128( + (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index 6319fdbbeb8f0..c1c9c3d47f805 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -1466,8 +1466,8 @@ _mm_cvtepu32_epi64(__m128i __V) { /// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are /// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [8 x i16] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, - __m128i __V2) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_packus_epi32(__m128i __V1, __m128i __V2) { return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); } diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index b6b54172ea186..2d3702c4929b8 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1030,24 +1030,28 @@ __m256i test_mm256_packs_epi16(__m256i a, __m256i b) { // CHECK: call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_packs_epi16(a, b); } +TEST_CONSTEXPR(match_v32qi(_mm256_packs_epi16((__m256i)(__v16hi){130, -200, 127, -128, 300, -1000, 42, -42, 500, -500, 1, -1, 128, -129, 256, -256}, (__m256i)(__v16hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128, 127, -128, 1, -1, 127, -128, 127, -128, 127, -128, 127, -128, 127, -128, 90, -90)); __m256i test_mm256_packs_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_packs_epi32 // CHECK: call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_packs_epi32(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_packs_epi32((__m256i)(__v8si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42}, (__m256i)(__v8si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767, 32767, -32768, 42, -42, -32768, 32767, 32767, -32768)); __m256i test_mm256_packs_epu16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_packs_epu16 // CHECK: call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_packus_epi16(a, b); } +TEST_CONSTEXPR(match_v32qi(_mm256_packus_epi16((__m256i)(__v16hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m256i)(__v16hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0)); __m256i test_mm256_packs_epu32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_packs_epu32 // CHECK: call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_packus_epi32(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_packus_epi32((__m256i)(__v8si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42}, (__m256i)(__v8si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0)); __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_permute2x128_si256 diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 0be4d78d814b1..bfdcd2503c8c5 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -956,6 +956,7 @@ __m512i test_mm512_packs_epi32(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.packssdw.512 return _mm512_packs_epi32(__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_packs_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 30000, 32768, -32769, 65535, -65536}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 22222, -22222, 40000, -40000}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767, 32767, -32768, 42, -42, -32768, 32767, 32767, -32768, 0, 1, -1, 30000, 32767, -32768, 32767, -32768, 32767, -32768, 32767, -32768, 22222, -22222, 32767, -32768)); __m512i test_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_packs_epi32 // CHECK: @llvm.x86.avx512.packssdw.512 @@ -973,6 +974,7 @@ __m512i test_mm512_packs_epi16(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.packsswb.512 return _mm512_packs_epi16(__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_packs_epi16((__m512i)(__v32hi){130, -200, 127, -128, 300, -1000, 42, -42, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 130, -200, 0, -1, 126, -127, 128, -129, 500, -500, 7, -7, 255, -255, 127, -128}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 5, -5, 100, -100, 127, -128, 512, -512, 1, 2, -2, 300, -300, 127, -128, 42, 0, 1, -1, 127, -128, 90, -90, -32768}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128, 127, -128, 127, -128, 127, -128, 90, -90, 5, -5, 100, -100, 127, -128, 127, -128, 127, -128, 0, -1, 126, -127, 127, -128, 1, 2, -2, 127, -128, 127, -128, 42, 127, -128, 7, -7, 127, -128, 127, -128, 0, 1, -1, 127, -128, 90, -90, -128)); __m512i test_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_packs_epi16 // CHECK: @llvm.x86.avx512.packsswb.512 @@ -990,6 +992,7 @@ __m512i test_mm512_packus_epi32(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.packusdw.512 return _mm512_packus_epi32(__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_packus_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 65535, 32768, -32769, 22222, -22222}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 40000, -40000, 65535, 0}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0, 0, 1, 0, -1, -1, 0, 32767, 0, -32768, 0, 22222, 0, -25536, 0, -1, 0)); __m512i test_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_packus_epi32 // CHECK: @llvm.x86.avx512.packusdw.512 @@ -1007,6 +1010,7 @@ __m512i test_mm512_packus_epi16(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.packuswb.512 return _mm512_packus_epi16(__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_packus_epi16((__m512i)(__v32hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129, -1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0, 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0)); __m512i test_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_packus_epi16 // CHECK: @llvm.x86.avx512.packuswb.512 diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 43d9ec5e6cc8b..e8faf8f937f9d 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -448,18 +448,21 @@ __m64 test_mm_packs_pi16(__m64 a, __m64 b) { // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128( return _mm_packs_pi16(a, b); } +TEST_CONSTEXPR(match_v8qi(_mm_packs_pi16((__m64)(__v4hi){130, -200, 127, -128}, (__m64)(__v4hi){0, 1, -1, 255}), 127, -128, 127, -128, 0, 1, -1, 127)); __m64 test_mm_packs_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pi32 // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128( return _mm_packs_pi32(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_packs_pi32((__m64)(__v2si){40000, -50000}, (__m64)(__v2si){0, 70000}), 32767, -32768, 0, 32767)); __m64 test_mm_packs_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pu16 // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128( return _mm_packs_pu16(a, b); } +TEST_CONSTEXPR(match_v8qi(_mm_packs_pu16((__m64)(__v4hi){-1, 0, 128, 300}, (__m64)(__v4hi){255, -200, 42, -42}), 0, 0, -128, -1, -1, 0, 42, 0)); __m64 test_mm_sad_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sad_pu8 diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c index 0ba32bb230cdd..035f2df5559d6 100644 --- a/clang/test/CodeGen/X86/sse2-builtins.c +++ b/clang/test/CodeGen/X86/sse2-builtins.c @@ -1020,18 +1020,21 @@ __m128i test_mm_packs_epi16(__m128i A, __m128i B) { // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_packs_epi16(A, B); } +TEST_CONSTEXPR(match_v16qi(_mm_packs_epi16((__m128i)(__v8hi){130, -200, 127, -128, 300, -1000, 42, -42}, (__m128i)(__v8hi){0, 1, -1, 255, -129, 128, 20000, -32768}), 127, -128, 127, -128, 127, -128, 42, -42, 0, 1, -1, 127, -128, 127, 127, -128)); __m128i test_mm_packs_epi32(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packs_epi32 // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_packs_epi32(A, B); } +TEST_CONSTEXPR(match_v8hi(_mm_packs_epi32((__m128i)(__v4si){40000, -50000, 32767, -32768}, (__m128i)(__v4si){0, 1, -1, 70000}), 32767, -32768, 32767, -32768, 0, 1, -1, 32767)); __m128i test_mm_packus_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packus_epi16 // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_packus_epi16(A, B); } +TEST_CONSTEXPR(match_v16qi(_mm_packus_epi16((__m128i)(__v8hi){-1, 0, 1, 127, 300, -1000, 255, -42}, (__m128i)(__v8hi){0, 1, -1, 255, -129, 128, 20000, -32768}), 0, 0, 1, 127, -1, 0, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0)); void test_mm_pause(void) { // CHECK-LABEL: test_mm_pause diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index c7265b188d572..3c3724643870e 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -399,6 +399,7 @@ __m128i test_mm_packus_epi32(__m128i x, __m128i y) { // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_packus_epi32(x, y); } +TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 32767, -32768}, (__m128i)(__v4si){0, 1, -1, 70000}), -25536, 0, 32767, 0, 0, 1, 0, -1)); __m128d test_mm_round_pd(__m128d x) { // CHECK-LABEL: test_mm_round_pd >From e09530bc91ad0e4f976fd53010f449caeb109da7 Mon Sep 17 00:00:00 2001 From: woruyu <[email protected]> Date: Sun, 21 Sep 2025 18:33:54 -0900 Subject: [PATCH 2/4] fix: review --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 24 ++++++++++---------- clang/lib/AST/ExprConstant.cpp | 28 ++++++++++++++---------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 26cbff74f2b84..fa5c05ac642be 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2605,9 +2605,9 @@ static bool interp__builtin_elementwise_int_binop( return true; } -static bool interp__builtin_x86_pack( - InterpState &S, CodePtr, const CallExpr *E, - llvm::function_ref<APSInt(const APSInt &)> narrowElement) { +static bool +interp__builtin_x86_pack(InterpState &S, CodePtr, const CallExpr *E, + llvm::function_ref<APInt(const APSInt &)> PackFn) { const auto *VT0 = E->getArg(0)->getType()->castAs<VectorType>(); const auto *VT1 = E->getArg(1)->getType()->castAs<VectorType>(); assert(VT0 && VT1 && "pack builtin VT0 and VT1 must be VectorType"); @@ -2628,6 +2628,7 @@ static bool interp__builtin_x86_pack( PrimType SrcT = *S.getContext().classify(VT0->getElementType()); PrimType DstT = *S.getContext().classify(getElemType(Dst)); + const bool IsUnsigend = getElemType(Dst)->isUnsignedIntegerType(); for (unsigned Lane = 0; Lane != Lanes; ++Lane) { const unsigned BaseSrc = Lane * SrcPerLane; @@ -2637,11 +2638,11 @@ static bool interp__builtin_x86_pack( INT_TYPE_SWITCH_NO_BOOL(SrcT, { APSInt A = LHS.elem<T>(BaseSrc + I).toAPSInt(); APSInt B = RHS.elem<T>(BaseSrc + I).toAPSInt(); - APSInt AO = narrowElement(A); - APSInt BO = narrowElement(B); - assignInteger(S, Dst.atIndex(BaseDst + I), DstT, AO); - assignInteger(S, Dst.atIndex(BaseDst + SrcPerLane + I), DstT, BO); + assignInteger(S, Dst.atIndex(BaseDst + I), DstT, + APSInt(PackFn(A), IsUnsigend)); + assignInteger(S, Dst.atIndex(BaseDst + SrcPerLane + I), DstT, + APSInt(PackFn(B), IsUnsigend)); }); } } @@ -3530,8 +3531,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_packssdw256: case clang::X86::BI__builtin_ia32_packssdw512: return interp__builtin_x86_pack(S, OpPC, Call, [](const APSInt &Src) { - APInt Value = APSInt(Src).truncSSat(Src.getBitWidth() / 2); - return APSInt(Value, /*isUnsigned=*/false); + return APInt(Src).truncSSat(Src.getBitWidth() / 2); }); case clang::X86::BI__builtin_ia32_packusdw128: case clang::X86::BI__builtin_ia32_packusdw256: @@ -3542,10 +3542,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return interp__builtin_x86_pack(S, OpPC, Call, [](const APSInt &Src) { unsigned DstBits = Src.getBitWidth() / 2; if (Src.isNegative()) - return APSInt(APInt::getZero(DstBits), /*isUnsigned=*/true); + return APInt::getZero(DstBits); if (Src.isIntN(DstBits)) - return APSInt(Src.trunc(DstBits), /*isUnsigned=*/true); - return APSInt(APInt::getAllOnes(DstBits), /*isUnsigned=*/true); + return APInt(Src).trunc(DstBits); + return APInt::getAllOnes(DstBits); }); case clang::X86::BI__builtin_ia32_vprotbi: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index b4a353f059f16..86b2d77487b95 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -55,6 +55,7 @@ #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "llvm/ADT/APFixedPoint.h" +#include "llvm/ADT/APSInt.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/StringExtras.h" @@ -11575,9 +11576,8 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO, return false; } -static bool -evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, - llvm::function_ref<APSInt(const APSInt &)> narrowElement) { +static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, + llvm::function_ref<APInt(const APSInt &)> PackFn) { APValue LHS, RHS; if (!EvaluateAsRValue(Info, E->getArg(0), LHS) || !EvaluateAsRValue(Info, E->getArg(1), RHS)) @@ -11591,6 +11591,11 @@ evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, const VectorType *VT0 = E->getArg(0)->getType()->castAs<VectorType>(); const unsigned SrcBits = Info.Ctx.getIntWidth(VT0->getElementType()); + + const VectorType *DstVT = E->getType()->castAs<VectorType>(); + QualType DstElemTy = DstVT->getElementType(); + const bool DstIsUnsigned = DstElemTy->isUnsignedIntegerType(); + const unsigned VectorBits = LHSVecLen * SrcBits; const unsigned srcPerLane = VectorBits >= 128 ? 128 / SrcBits : LHSVecLen; const unsigned lanes = VectorBits >= 128 ? VectorBits / 128 : 1; @@ -11601,11 +11606,11 @@ evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, for (unsigned lane = 0; lane != lanes; ++lane) { unsigned base = lane * srcPerLane; for (unsigned i = 0; i != srcPerLane; ++i) - Out.emplace_back( - APValue(narrowElement(LHS.getVectorElt(base + i).getInt()))); + Out.emplace_back(APValue( + APSInt(PackFn(LHS.getVectorElt(base + i).getInt()), DstIsUnsigned))); for (unsigned i = 0; i != srcPerLane; ++i) - Out.emplace_back( - APValue(narrowElement(RHS.getVectorElt(base + i).getInt()))); + Out.emplace_back(APValue( + APSInt(PackFn(RHS.getVectorElt(base + i).getInt()), DstIsUnsigned))); } Result = APValue(Out.data(), Out.size()); @@ -11812,8 +11817,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_packssdw256: case X86::BI__builtin_ia32_packssdw512: return evalPackBuiltin(E, Info, Result, [](const APSInt &Src) { - APInt Value = APSInt(Src).truncSSat(Src.getBitWidth() / 2); - return APSInt(Value, /*isUnsigned=*/false); + return APSInt(Src).truncSSat(Src.getBitWidth() / 2); }); case X86::BI__builtin_ia32_packusdw128: case X86::BI__builtin_ia32_packusdw256: @@ -11824,10 +11828,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return evalPackBuiltin(E, Info, Result, [](const APSInt &Src) { unsigned DstBits = Src.getBitWidth() / 2; if (Src.isNegative()) - return APSInt(APInt::getZero(DstBits), /*isUnsigned=*/true); + return APInt::getZero(DstBits); if (Src.isIntN(DstBits)) - return APSInt(Src.trunc(DstBits), /*isUnsigned=*/true); - return APSInt(APInt::getAllOnes(DstBits), /*isUnsigned=*/true); + return APInt((Src).trunc(DstBits)); + return APInt::getAllOnes(DstBits); }); case clang::X86::BI__builtin_ia32_pmuldq128: case clang::X86::BI__builtin_ia32_pmuldq256: >From 3b3c325c0724a7a07a77d0ce8b90693ac4417c83 Mon Sep 17 00:00:00 2001 From: woruyu <[email protected]> Date: Sun, 21 Sep 2025 18:36:29 -0900 Subject: [PATCH 3/4] fix: unnecessary include --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index fa5c05ac642be..bd7fdbd2035f6 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "../ExprConstShared.h" #include "Boolean.h" -#include "ByteCode/FixedPoint.h" #include "EvalEmitter.h" #include "Interp.h" #include "InterpBuiltinBitCast.h" >From 83e4b0f282dd85c8a2dc21d9ae75777f8654b9b3 Mon Sep 17 00:00:00 2001 From: woruyu <[email protected]> Date: Tue, 23 Sep 2025 18:10:31 -0900 Subject: [PATCH 4/4] fix: review --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 5 ++--- clang/lib/AST/ExprConstant.cpp | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index bd7fdbd2035f6..2cdbfa7a08502 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2621,9 +2621,8 @@ interp__builtin_x86_pack(InterpState &S, CodePtr, const CallExpr *E, const ASTContext &ASTCtx = S.getASTContext(); const unsigned SrcBits = ASTCtx.getIntWidth(VT0->getElementType()); const unsigned LHSVecLen = VT0->getNumElements(); - const unsigned VectorBits = LHSVecLen * SrcBits; - const unsigned SrcPerLane = VectorBits >= 128 ? (128 / SrcBits) : LHSVecLen; - const unsigned Lanes = VectorBits >= 128 ? (VectorBits / 128) : 1; + const unsigned SrcPerLane = 128 / SrcBits; + const unsigned Lanes = LHSVecLen * SrcBits / 128; PrimType SrcT = *S.getContext().classify(VT0->getElementType()); PrimType DstT = *S.getContext().classify(getElemType(Dst)); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 86b2d77487b95..6c4184c78123a 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -55,7 +55,6 @@ #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "llvm/ADT/APFixedPoint.h" -#include "llvm/ADT/APSInt.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/StringExtras.h" @@ -11596,9 +11595,8 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, QualType DstElemTy = DstVT->getElementType(); const bool DstIsUnsigned = DstElemTy->isUnsignedIntegerType(); - const unsigned VectorBits = LHSVecLen * SrcBits; - const unsigned srcPerLane = VectorBits >= 128 ? 128 / SrcBits : LHSVecLen; - const unsigned lanes = VectorBits >= 128 ? VectorBits / 128 : 1; + const unsigned srcPerLane = 128 / SrcBits; + const unsigned lanes = LHSVecLen * SrcBits / 128; SmallVector<APValue, 64> Out; Out.reserve(LHSVecLen + RHSVecLen); _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
