https://github.com/pierluigilenoci updated https://github.com/llvm/llvm-project/pull/188887
>From fa3f5ac7567fde45327eeaa6fa429bcfd4150592 Mon Sep 17 00:00:00 2001 From: Pierluigi Lenoci <[email protected]> Date: Fri, 27 Mar 2026 01:43:36 +0100 Subject: [PATCH 1/5] [Clang] Allow VDBPSADBW intrinsics to be used in constexpr Add constexpr evaluation support for the VDBPSADBW (Double Block Packed Sum-Absolute-Differences) intrinsics (__builtin_ia32_dbpsadbw128/256/512) in both the tree-based constant evaluator (ExprConstant.cpp) and the bytecode constexpr interpreter (InterpBuiltin.cpp). The VDBPSADBW instruction computes the sum of absolute differences of groups of 4 unsigned bytes from the second source against two 4-byte reference blocks selected from the first source by the immediate operand. Per 128-bit lane, imm8[1:0] selects blockA and imm8[3:2] selects blockB from the first source. For each group of 4 bytes in the second source, two SAD values are computed (one against each block), producing 8 result words per 128-bit lane. Care is taken to treat input bytes as unsigned (the builtin signature uses signed char vectors) by extracting via getZExtValue() and casting to uint8_t before computing absolute differences. 
Fixes #188747 Signed-off-by: Pierluigi Lenoci <[email protected]> --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 64 ++++++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 57 +++++++++++++++++ clang/test/CodeGen/X86/avx512bw-builtins.c | 23 +++++-- clang/test/CodeGen/X86/avx512vlbw-builtins.c | 58 +++++++++++++++--- 4 files changed, 190 insertions(+), 12 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 214013396e885..d9a14f84e4a8a 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2820,6 +2820,65 @@ static bool interp__builtin_ia32_pmul( return true; } +static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + assert(Call->getNumArgs() == 3); + QualType Arg2Type = Call->getArg(2)->getType(); + APSInt ImmVal = popToAPSInt(S, Arg2Type); + unsigned Imm = ImmVal.getZExtValue(); + + const Pointer &Src2 = S.Stk.pop<Pointer>(); + const Pointer &Src1 = S.Stk.pop<Pointer>(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + const auto *SrcVT = Call->getArg(0)->getType()->castAs<VectorType>(); + PrimType SrcElemT = *S.getContext().classify(SrcVT->getElementType()); + unsigned SourceLen = SrcVT->getNumElements(); + + const auto *DestVT = Call->getType()->castAs<VectorType>(); + PrimType DestElemT = *S.getContext().classify(DestVT->getElementType()); + bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType(); + + unsigned LaneSize = 16; // 128-bit lane = 16 bytes + unsigned NumLanes = SourceLen / LaneSize; + unsigned BlockOffsetA = (Imm & 0x3) * 4; + unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4; + + unsigned DstIdx = 0; + for (unsigned Lane = 0; Lane < NumLanes; ++Lane) { + unsigned LaneStart = Lane * LaneSize; + + for (unsigned J = 0; J < 4; ++J) { + unsigned SadA = 0; + unsigned SadB = 0; + for (unsigned K = 0; K < 4; ++K) { + unsigned A1Val, A2Val, BVal; + INT_TYPE_SWITCH_NO_BOOL(SrcElemT, { + // 
Treat as unsigned bytes + A1Val = static_cast<uint8_t>( + Src1.elem<T>(LaneStart + BlockOffsetA + K).toAPSInt().getZExtValue()); + A2Val = static_cast<uint8_t>( + Src1.elem<T>(LaneStart + BlockOffsetB + K).toAPSInt().getZExtValue()); + BVal = static_cast<uint8_t>( + Src2.elem<T>(LaneStart + 4 * J + K).toAPSInt().getZExtValue()); + }); + SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal); + SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal); + } + INT_TYPE_SWITCH_NO_BOOL(DestElemT, { + Dst.elem<T>(DstIdx) = + static_cast<T>(APSInt(APInt(16, SadA), DestUnsigned)); + Dst.elem<T>(DstIdx + 1) = + static_cast<T>(APSInt(APInt(16, SadB), DestUnsigned)); + }); + DstIdx += 2; + } + } + + Dst.initializeAllElements(); + return true; +} + static bool interp_builtin_horizontal_int_binop( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) { @@ -4861,6 +4920,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth)); }); + case clang::X86::BI__builtin_ia32_dbpsadbw128: + case clang::X86::BI__builtin_ia32_dbpsadbw256: + case clang::X86::BI__builtin_ia32_dbpsadbw512: + return interp__builtin_ia32_dbpsadbw(S, OpPC, Call); + case clang::X86::BI__builtin_ia32_pmulhuw128: case clang::X86::BI__builtin_ia32_pmulhuw256: case clang::X86::BI__builtin_ia32_pmulhuw512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 4f45fa728c605..fc4a4834b462a 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12564,6 +12564,63 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case clang::X86::BI__builtin_ia32_dbpsadbw128: + case clang::X86::BI__builtin_ia32_dbpsadbw256: + case clang::X86::BI__builtin_ia32_dbpsadbw512: { + APValue SourceA, SourceB, SourceImm; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceA) 
|| + !EvaluateAsRValue(Info, E->getArg(1), SourceB) || + !EvaluateAsRValue(Info, E->getArg(2), SourceImm)) + return false; + + unsigned SourceLen = SourceA.getVectorLength(); + unsigned LaneSize = 16; // 128-bit lane = 16 bytes + unsigned NumLanes = SourceLen / LaneSize; + unsigned Imm = SourceImm.getInt().getZExtValue(); + unsigned BlockOffsetA = (Imm & 0x3) * 4; + unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4; + + auto *DestTy = E->getType()->castAs<VectorType>(); + QualType DestEltTy = DestTy->getElementType(); + bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType(); + SmallVector<APValue, 32> ResultElements; + ResultElements.reserve(SourceLen / 2); + + for (unsigned Lane = 0; Lane < NumLanes; ++Lane) { + unsigned LaneStart = Lane * LaneSize; + + for (unsigned J = 0; J < 4; ++J) { + // Compute SAD of SourceB[4*J..4*J+3] vs blockA from SourceA + unsigned SadA = 0; + unsigned SadB = 0; + for (unsigned K = 0; K < 4; ++K) { + // Treat input bytes as unsigned + unsigned A = static_cast<uint8_t>( + SourceA.getVectorElt(LaneStart + BlockOffsetA + K) + .getInt() + .getZExtValue()); + unsigned B = static_cast<uint8_t>( + SourceB.getVectorElt(LaneStart + 4 * J + K) + .getInt() + .getZExtValue()); + SadA += (B > A) ? (B - A) : (A - B); + + unsigned A2 = static_cast<uint8_t>( + SourceA.getVectorElt(LaneStart + BlockOffsetB + K) + .getInt() + .getZExtValue()); + SadB += (B > A2) ? 
(B - A2) : (A2 - B); + } + ResultElements.push_back( + APValue(APSInt(APInt(16, SadA), DestUnsigned))); + ResultElements.push_back( + APValue(APSInt(APInt(16, SadB), DestUnsigned))); + } + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_pmulhuw128: case clang::X86::BI__builtin_ia32_pmulhuw256: case clang::X86::BI__builtin_ia32_pmulhuw512: diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 2020b72a649ae..488146e740db4 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -3230,21 +3230,36 @@ TEST_CONSTEXPR(match_v64qi(_mm512_maskz_alignr_epi8((__mmask64)0x000000000000000 __m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mm_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.512 - return _mm512_dbsad_epu8(__A, __B, 170); -} + return _mm512_dbsad_epu8(__A, __B, 170); +} +// 512-bit: 4 lanes, imm8=0: blockA=blockB=lane[0..3] for each lane +// Each lane behaves the same as the 128-bit case with matching data +TEST_CONSTEXPR(match_v32hu(_mm512_dbsad_epu8( + ((__m512i)(__v64qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), + ((__m512i)(__v64qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), + 0), 4, 4, 20, 20, 36, 36, 52, 52, + 4, 4, 20, 20, 36, 36, 52, 52, + 4, 4, 20, 20, 36, 36, 52, 52, + 4, 4, 20, 20, 36, 36, 52, 52)); __m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mm_mask_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.512 //CHECK: 
select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170); + return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170); } __m512i test_mm512_mm_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mm_maskz_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.512 //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170); + return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170); } __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) { diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 0ee14909ae805..098ee29b1989e 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -3676,41 +3676,83 @@ TEST_CONSTEXPR(match_v32qi(_mm256_maskz_alignr_epi8((__mmask32)0xf000000f, ((__m __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.128 - return _mm_dbsad_epu8(__A, __B, 170); -} + return _mm_dbsad_epu8(__A, __B, 170); +} +// imm8=4: blockA=A[0..3]={0,1,2,3}, blockB=A[4..7]={4,5,6,7} +// J=0: B[0..3]={1,2,3,4} vs blockA=4, vs blockB=12 +// J=1: B[4..7]={5,6,7,8} vs blockA=20, vs blockB=4 +// J=2: B[8..11]={9,10,11,12} vs blockA=36, vs blockB=20 +// J=3: B[12..15]={13,14,15,16} vs blockA=52, vs blockB=36 +TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8( + ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), + ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), + 4), 4, 12, 20, 4, 36, 20, 52, 36)); +// imm8=0: blockA=blockB=A[0..3]={0,1,2,3} +TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8( + ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), + ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), + 0), 4, 4, 20, 20, 36, 36, 52, 52)); +// Test with unsigned values 
> 127 (signed overflow territory) +// A[0..3]={200,100,50,25}, B[0..3]={180,120,40,30} +// imm8=0: blockA=blockB=A[0..3] +// SAD = |180-200|+|120-100|+|40-50|+|30-25| = 20+20+10+5 = 55 +TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8( + ((__m128i)(__v16qu){200, 100, 50, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}), + ((__m128i)(__v16qu){180, 120, 40, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}), + 0), 55, 55, 375, 375, 375, 375, 375, 375)); __m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} - return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170); + return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170); } +// Test masked version: mask=0x55 (keep even elements, passthrough odd) +TEST_CONSTEXPR(match_v8hu(_mm_mask_dbsad_epu8( + ((__m128i)(__v8hu){99, 99, 99, 99, 99, 99, 99, 99}), (__mmask8)0x55, + ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), + ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), + 4), 4, 99, 20, 99, 36, 99, 52, 99)); __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} - return _mm_maskz_dbsad_epu8(__U, __A, __B, 170); + return _mm_maskz_dbsad_epu8(__U, __A, __B, 170); } +// Test zero-masked version: mask=0xAA (keep odd elements, zero even) +TEST_CONSTEXPR(match_v8hu(_mm_maskz_dbsad_epu8((__mmask8)0xAA, + ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), + ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), + 4), 0, 12, 0, 4, 0, 20, 0, 36)); __m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.256 - return _mm256_dbsad_epu8(__A, 
__B, 170); -} + return _mm256_dbsad_epu8(__A, __B, 170); +} +// 256-bit: 2 lanes, imm8=0: blockA=blockB=lane[0..3] +// Lane 0: same as 128-bit test above +// Lane 1: A[16..19]={16,17,18,19}, B[16..19]={17,18,19,20} -> SAD=4 +TEST_CONSTEXPR(match_v16hu(_mm256_dbsad_epu8( + ((__m256i)(__v32qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), + ((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), + 0), 4, 4, 20, 20, 36, 36, 52, 52, 4, 4, 20, 20, 36, 36, 52, 52)); __m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} - return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170); + return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170); } __m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} - return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170); + return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170); } __mmask8 test_mm_movepi16_mask(__m128i __A) { // CHECK-LABEL: test_mm_movepi16_mask >From 8b292ead3eb31c002f6deb2e7179b1b208f6076d Mon Sep 17 00:00:00 2001 From: Pierluigi Lenoci <[email protected]> Date: Fri, 27 Mar 2026 18:43:08 +0100 Subject: [PATCH 2/5] fix: address reviewer feedback for constexpr VDBPSADBW - Add Constexpr tags to BuiltinsX86.td for VDBPSADBW builtins - Update InterpBuiltin.cpp per tbaederr's suggestions: - Use popToUInt64 instead of popToAPSInt for immediate value - Use != instead of < in loop comparison - Simplify element access by removing unnecessary toAPSInt().getZExtValue() - Apply clang-format fix in ExprConstant.cpp 
Signed-off-by: Pierluigi Lenoci <[email protected]> --- clang/include/clang/Basic/BuiltinsX86.td | 6 +++--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 12 +++++------- clang/lib/AST/ExprConstant.cpp | 8 ++++---- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index f47532a63de04..e54f8d66843bf 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -3197,15 +3197,15 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">; } -let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def dbpsadbw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>, _Constant int)">; } -let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def dbpsadbw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>, _Constant int)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">; def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index d9a14f84e4a8a..5d46b2c595b1f 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2823,9 +2823,7 @@ static bool 
interp__builtin_ia32_pmul( static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC, const CallExpr *Call) { assert(Call->getNumArgs() == 3); - QualType Arg2Type = Call->getArg(2)->getType(); - APSInt ImmVal = popToAPSInt(S, Arg2Type); - unsigned Imm = ImmVal.getZExtValue(); + unsigned Imm = popToUInt64(S, Call->getArg(2)); const Pointer &Src2 = S.Stk.pop<Pointer>(); const Pointer &Src1 = S.Stk.pop<Pointer>(); @@ -2845,7 +2843,7 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC, unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4; unsigned DstIdx = 0; - for (unsigned Lane = 0; Lane < NumLanes; ++Lane) { + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { unsigned LaneStart = Lane * LaneSize; for (unsigned J = 0; J < 4; ++J) { @@ -2856,11 +2854,11 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC, INT_TYPE_SWITCH_NO_BOOL(SrcElemT, { // Treat as unsigned bytes A1Val = static_cast<uint8_t>( - Src1.elem<T>(LaneStart + BlockOffsetA + K).toAPSInt().getZExtValue()); + Src1.elem<T>(LaneStart + BlockOffsetA + K)); A2Val = static_cast<uint8_t>( - Src1.elem<T>(LaneStart + BlockOffsetB + K).toAPSInt().getZExtValue()); + Src1.elem<T>(LaneStart + BlockOffsetB + K)); BVal = static_cast<uint8_t>( - Src2.elem<T>(LaneStart + 4 * J + K).toAPSInt().getZExtValue()); + Src2.elem<T>(LaneStart + 4 * J + K)); }); SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal); SadB += (BVal > A2Val) ? 
(BVal - A2Val) : (A2Val - BVal); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index fc4a4834b462a..2a6e1713fba4d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12599,10 +12599,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { SourceA.getVectorElt(LaneStart + BlockOffsetA + K) .getInt() .getZExtValue()); - unsigned B = static_cast<uint8_t>( - SourceB.getVectorElt(LaneStart + 4 * J + K) - .getInt() - .getZExtValue()); + unsigned B = + static_cast<uint8_t>(SourceB.getVectorElt(LaneStart + 4 * J + K) + .getInt() + .getZExtValue()); SadA += (B > A) ? (B - A) : (A - B); unsigned A2 = static_cast<uint8_t>( >From b2ef04823423b38720746a63a0a661a54852de90 Mon Sep 17 00:00:00 2001 From: Pierluigi Lenoci <[email protected]> Date: Sat, 28 Mar 2026 16:50:42 +0100 Subject: [PATCH 3/5] style: apply clang-format to modified files Signed-off-by: Pierluigi Lenoci <[email protected]> --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 5d46b2c595b1f..4ba611cf68013 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2853,12 +2853,11 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC, unsigned A1Val, A2Val, BVal; INT_TYPE_SWITCH_NO_BOOL(SrcElemT, { // Treat as unsigned bytes - A1Val = static_cast<uint8_t>( - Src1.elem<T>(LaneStart + BlockOffsetA + K)); - A2Val = static_cast<uint8_t>( - Src1.elem<T>(LaneStart + BlockOffsetB + K)); - BVal = static_cast<uint8_t>( - Src2.elem<T>(LaneStart + 4 * J + K)); + A1Val = + static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetA + K)); + A2Val = + static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetB + K)); + BVal = static_cast<uint8_t>(Src2.elem<T>(LaneStart + 4 * J + K)); }); SadA += (BVal > A1Val) ? 
(BVal - A1Val) : (A1Val - BVal); SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal); >From bea99a2dc693330aeacf0c7a386bee072221cd5c Mon Sep 17 00:00:00 2001 From: Pierluigi Lenoci <[email protected]> Date: Sat, 28 Mar 2026 22:35:25 +0100 Subject: [PATCH 4/5] fix: address review feedback - split psadbw512 from constexpr block Split psadbw512 out of the shared BuiltinsX86.td let-block with dbpsadbw512 to avoid erroneously marking psadbw512 as Constexpr. The psadbw512 builtin does not have constexpr evaluation support, so it should not be tagged with Constexpr. This addresses RKSimon's review feedback about missing/incorrect Constexpr tags. The three code suggestions from tbaederr were already addressed in a prior commit: - Use popToUInt64 instead of popToAPSInt for the immediate value - Use != instead of < in the lane loop comparison - Remove unnecessary .toAPSInt().getZExtValue() from element access Signed-off-by: Pierluigi Lenoci <[email protected]> --- clang/include/clang/Basic/BuiltinsX86.td | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index e54f8d66843bf..59c79bf681103 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -3207,6 +3207,9 @@ let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, Req let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">; +} + +let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">; } >From c5f9446197ab98ad3a7415f0439c0b2bcc701b07 Mon Sep 17 00:00:00 2001 From: Pierluigi Lenoci <[email protected]> Date: Sun, 29 Mar 2026 23:21:18 +0200 Subject: [PATCH 5/5] fix: rewrite VDBPSADBW constexpr to 
match hardware behavior The previous implementation was fundamentally incorrect: it only used two 2-bit fields from imm8 to select two fixed blocks from src1, then computed a simple block-vs-block SAD. The actual VDBPSADBW instruction uses a two-phase algorithm: Phase 1 (Shuffle): All four 2-bit fields of imm8 are used to shuffle src2 within each 128-bit lane. Each field selects one of four 4-byte blocks from src2. Phase 2 (Sliding SAD): A sliding/overlapping window computes SADs between src1 bytes and the shuffled src2 bytes. For each 8-byte chunk, four u16 results are produced: results 0 and 1 compare src1 bytes 0..3 against the shuffled bytes starting at offsets 0 and 1, and results 2 and 3 compare src1 bytes 4..7 against the shuffled bytes starting at offsets 2 and 3. The correct algorithm matches GCC's reference implementation in gcc/testsuite/gcc.target/i386/avx512bw-vdbpsadbw-2.c and has been verified against hardware output provided by @RKSimon: _mm_dbsad_epu8([0..15], [1..16], 4) = [4, 8, 4, 0, 28, 28, 44, 44] Both ExprConstant.cpp and InterpBuiltin.cpp are updated with the same corrected algorithm. All TEST_CONSTEXPR expected values are recomputed to match. 
Signed-off-by: Pierluigi Lenoci <[email protected]> --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 61 +++++++++++-------- clang/lib/AST/ExprConstant.cpp | 62 +++++++++++--------- clang/test/CodeGen/X86/avx512bw-builtins.c | 11 ++-- clang/test/CodeGen/X86/avx512vlbw-builtins.c | 29 ++++----- 4 files changed, 87 insertions(+), 76 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 305559b3eb025..15b5d85947433 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2840,38 +2840,51 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC, PrimType DestElemT = *S.getContext().classify(DestVT->getElementType()); bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType(); - unsigned LaneSize = 16; // 128-bit lane = 16 bytes - unsigned NumLanes = SourceLen / LaneSize; - unsigned BlockOffsetA = (Imm & 0x3) * 4; - unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4; - - unsigned DstIdx = 0; - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - unsigned LaneStart = Lane * LaneSize; + constexpr unsigned LaneSize = 16; // 128-bit lane = 16 bytes + // Phase 1: Shuffle Src2 using all four 2-bit fields of imm8. + // Within each 128-bit lane, for group j (0..3), select a 4-byte block + // from Src2 based on bits [2*j+1:2*j] of imm8. 
+ uint8_t Shuffled[64]; // max 512-bit = 64 bytes + for (unsigned I = 0; I < SourceLen; I += LaneSize) { for (unsigned J = 0; J < 4; ++J) { - unsigned SadA = 0; - unsigned SadB = 0; + unsigned Part = (Imm >> (2 * J)) & 3; for (unsigned K = 0; K < 4; ++K) { - unsigned A1Val, A2Val, BVal; INT_TYPE_SWITCH_NO_BOOL(SrcElemT, { - // Treat as unsigned bytes - A1Val = - static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetA + K)); - A2Val = - static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetB + K)); - BVal = static_cast<uint8_t>(Src2.elem<T>(LaneStart + 4 * J + K)); + Shuffled[I + 4 * J + K] = + static_cast<uint8_t>(Src2.elem<T>(I + 4 * Part + K)); }); - SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal); - SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal); } + } + } + + // Phase 2: Sliding SAD computation. + // For every group of 4 output u16 values, compute absolute differences + // using overlapping windows into Src1 and the shuffled array. + unsigned Size = SourceLen / 2; // number of output u16 elements + unsigned DstIdx = 0; + for (unsigned I = 0; I < Size; I += 4) { + unsigned Sad[4] = {0, 0, 0, 0}; + for (unsigned J = 0; J < 4; ++J) { + uint8_t A1, A2; + INT_TYPE_SWITCH_NO_BOOL(SrcElemT, { + A1 = static_cast<uint8_t>(Src1.elem<T>(2 * I + J)); + A2 = static_cast<uint8_t>(Src1.elem<T>(2 * I + J + 4)); + }); + uint8_t B0 = Shuffled[2 * I + J]; + uint8_t B1 = Shuffled[2 * I + J + 1]; + uint8_t B2 = Shuffled[2 * I + J + 2]; + uint8_t B3 = Shuffled[2 * I + J + 3]; + Sad[0] += (A1 > B0) ? (A1 - B0) : (B0 - A1); + Sad[1] += (A1 > B1) ? (A1 - B1) : (B1 - A1); + Sad[2] += (A2 > B2) ? (A2 - B2) : (B2 - A2); + Sad[3] += (A2 > B3) ? 
(A2 - B3) : (B3 - A2); + } + for (unsigned R = 0; R < 4; ++R) { INT_TYPE_SWITCH_NO_BOOL(DestElemT, { - Dst.elem<T>(DstIdx) = - static_cast<T>(APSInt(APInt(16, SadA), DestUnsigned)); - Dst.elem<T>(DstIdx + 1) = - static_cast<T>(APSInt(APInt(16, SadB), DestUnsigned)); + Dst.elem<T>(DstIdx++) = + static_cast<T>(APSInt(APInt(16, Sad[R]), DestUnsigned)); }); - DstIdx += 2; } } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 2a6e1713fba4d..cc8b3d8e61b93 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12574,11 +12574,8 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return false; unsigned SourceLen = SourceA.getVectorLength(); - unsigned LaneSize = 16; // 128-bit lane = 16 bytes - unsigned NumLanes = SourceLen / LaneSize; + constexpr unsigned LaneSize = 16; // 128-bit lane = 16 bytes unsigned Imm = SourceImm.getInt().getZExtValue(); - unsigned BlockOffsetA = (Imm & 0x3) * 4; - unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4; auto *DestTy = E->getType()->castAs<VectorType>(); QualType DestEltTy = DestTy->getElementType(); @@ -12586,38 +12583,47 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { SmallVector<APValue, 32> ResultElements; ResultElements.reserve(SourceLen / 2); - for (unsigned Lane = 0; Lane < NumLanes; ++Lane) { - unsigned LaneStart = Lane * LaneSize; - + // Phase 1: Shuffle SourceB using all four 2-bit fields of imm8. + // Within each 128-bit lane, for group j (0..3), select a 4-byte block + // from SourceB based on bits [2*j+1:2*j] of imm8. 
+ SmallVector<uint8_t, 64> Shuffled(SourceLen); + for (unsigned I = 0; I < SourceLen; I += LaneSize) { for (unsigned J = 0; J < 4; ++J) { - // Compute SAD of SourceB[4*J..4*J+3] vs blockA from SourceA - unsigned SadA = 0; - unsigned SadB = 0; + unsigned Part = (Imm >> (2 * J)) & 3; for (unsigned K = 0; K < 4; ++K) { - // Treat input bytes as unsigned - unsigned A = static_cast<uint8_t>( - SourceA.getVectorElt(LaneStart + BlockOffsetA + K) - .getInt() - .getZExtValue()); - unsigned B = - static_cast<uint8_t>(SourceB.getVectorElt(LaneStart + 4 * J + K) - .getInt() - .getZExtValue()); - SadA += (B > A) ? (B - A) : (A - B); - - unsigned A2 = static_cast<uint8_t>( - SourceA.getVectorElt(LaneStart + BlockOffsetB + K) + Shuffled[I + 4 * J + K] = static_cast<uint8_t>( + SourceB.getVectorElt(I + 4 * Part + K) .getInt() .getZExtValue()); - SadB += (B > A2) ? (B - A2) : (A2 - B); } - ResultElements.push_back( - APValue(APSInt(APInt(16, SadA), DestUnsigned))); - ResultElements.push_back( - APValue(APSInt(APInt(16, SadB), DestUnsigned))); } } + // Phase 2: Sliding SAD computation. + // For every group of 4 output u16 values, compute absolute differences + // using overlapping windows into SourceA and the shuffled array. + unsigned Size = SourceLen / 2; // number of output u16 elements + for (unsigned I = 0; I < Size; I += 4) { + unsigned Sad[4] = {0, 0, 0, 0}; + for (unsigned J = 0; J < 4; ++J) { + uint8_t A1 = static_cast<uint8_t>( + SourceA.getVectorElt(2 * I + J).getInt().getZExtValue()); + uint8_t A2 = static_cast<uint8_t>( + SourceA.getVectorElt(2 * I + J + 4).getInt().getZExtValue()); + uint8_t B0 = Shuffled[2 * I + J]; + uint8_t B1 = Shuffled[2 * I + J + 1]; + uint8_t B2 = Shuffled[2 * I + J + 2]; + uint8_t B3 = Shuffled[2 * I + J + 3]; + Sad[0] += (A1 > B0) ? (A1 - B0) : (B0 - A1); + Sad[1] += (A1 > B1) ? (A1 - B1) : (B1 - A1); + Sad[2] += (A2 > B2) ? (A2 - B2) : (B2 - A2); + Sad[3] += (A2 > B3) ? 
(A2 - B3) : (B3 - A2); + } + for (unsigned R = 0; R < 4; ++R) + ResultElements.push_back( + APValue(APSInt(APInt(16, Sad[R]), DestUnsigned))); + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); } diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 488146e740db4..b9fc4fa3f7ab9 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -3232,8 +3232,7 @@ __m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.dbpsadbw.512 return _mm512_dbsad_epu8(__A, __B, 170); } -// 512-bit: 4 lanes, imm8=0: blockA=blockB=lane[0..3] for each lane -// Each lane behaves the same as the 128-bit case with matching data +// 512-bit: 4 lanes, imm8=0: all shuffle groups select block 0 per lane TEST_CONSTEXPR(match_v32hu(_mm512_dbsad_epu8( ((__m512i)(__v64qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -3243,10 +3242,10 @@ TEST_CONSTEXPR(match_v32hu(_mm512_dbsad_epu8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), - 0), 4, 4, 20, 20, 36, 36, 52, 52, - 4, 4, 20, 20, 36, 36, 52, 52, - 4, 4, 20, 20, 36, 36, 52, 52, - 4, 4, 20, 20, 36, 36, 52, 52)); + 0), 4, 8, 12, 12, 28, 28, 44, 44, + 4, 8, 12, 12, 28, 28, 44, 44, + 4, 8, 12, 12, 28, 28, 44, 44, + 4, 8, 12, 12, 28, 28, 44, 44)); __m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mm_mask_dbsad_epu8 diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 098ee29b1989e..2e148278a7cf2 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -3678,28 +3678,23 @@ __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) { // 
CHECK: @llvm.x86.avx512.dbpsadbw.128 return _mm_dbsad_epu8(__A, __B, 170); } -// imm8=4: blockA=A[0..3]={0,1,2,3}, blockB=A[4..7]={4,5,6,7} -// J=0: B[0..3]={1,2,3,4} vs blockA=4, vs blockB=12 -// J=1: B[4..7]={5,6,7,8} vs blockA=20, vs blockB=4 -// J=2: B[8..11]={9,10,11,12} vs blockA=36, vs blockB=20 -// J=3: B[12..15]={13,14,15,16} vs blockA=52, vs blockB=36 +// imm8=4 (0b00000100): shuffle selects src2 blocks [0,1,0,0] per lane +// Phase 1 builds tmp, Phase 2 computes sliding SADs TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8( ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), - 4), 4, 12, 20, 4, 36, 20, 52, 36)); -// imm8=0: blockA=blockB=A[0..3]={0,1,2,3} + 4), 4, 8, 4, 0, 28, 28, 44, 44)); +// imm8=0: all four 2-bit fields select block 0 from src2 TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8( ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), - 0), 4, 4, 20, 20, 36, 36, 52, 52)); + 0), 4, 8, 12, 12, 28, 28, 44, 44)); // Test with unsigned values > 127 (signed overflow territory) -// A[0..3]={200,100,50,25}, B[0..3]={180,120,40,30} -// imm8=0: blockA=blockB=A[0..3] -// SAD = |180-200|+|120-100|+|40-50|+|30-25| = 20+20+10+5 = 55 +// imm8=0: all shuffle groups select src2[0..3]={180,120,40,30} TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8( ((__m128i)(__v16qu){200, 100, 50, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}), ((__m128i)(__v16qu){180, 120, 40, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}), - 0), 55, 55, 375, 375, 375, 375, 375, 375)); + 0), 55, 315, 370, 370, 370, 370, 370, 370)); __m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dbsad_epu8 @@ -3712,7 +3707,7 @@ TEST_CONSTEXPR(match_v8hu(_mm_mask_dbsad_epu8( ((__m128i)(__v8hu){99, 99, 99, 99, 99, 99, 99, 99}), (__mmask8)0x55, ((__m128i)(__v16qu){0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), - 4), 4, 99, 20, 99, 36, 99, 52, 99)); + 4), 4, 99, 4, 99, 28, 99, 44, 99)); __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dbsad_epu8 @@ -3724,22 +3719,20 @@ __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) { TEST_CONSTEXPR(match_v8hu(_mm_maskz_dbsad_epu8((__mmask8)0xAA, ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), - 4), 0, 12, 0, 4, 0, 20, 0, 36)); + 4), 0, 8, 0, 0, 0, 28, 0, 44)); __m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.256 return _mm256_dbsad_epu8(__A, __B, 170); } -// 256-bit: 2 lanes, imm8=0: blockA=blockB=lane[0..3] -// Lane 0: same as 128-bit test above -// Lane 1: A[16..19]={16,17,18,19}, B[16..19]={17,18,19,20} -> SAD=4 +// 256-bit: 2 lanes, imm8=0: all shuffle groups select block 0 per lane TEST_CONSTEXPR(match_v16hu(_mm256_dbsad_epu8( ((__m256i)(__v32qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}), ((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), - 0), 4, 4, 20, 20, 36, 36, 52, 52, 4, 4, 20, 20, 36, 36, 52, 52)); + 0), 4, 8, 12, 12, 28, 28, 44, 44, 4, 8, 12, 12, 28, 28, 44, 44)); __m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dbsad_epu8 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
