[clang] [Clang] Allow VDBPSADBW intrinsics in constexpr (PR #188887)

Pierluigi Lenoci via cfe-commits Sat, 28 Mar 2026 08:51:21 -0700

https://github.com/pierluigilenoci updated 
https://github.com/llvm/llvm-project/pull/188887


>From fa3f5ac7567fde45327eeaa6fa429bcfd4150592 Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <[email protected]>
Date: Fri, 27 Mar 2026 01:43:36 +0100
Subject: [PATCH 1/3] [Clang] Allow VDBPSADBW intrinsics to be used in
 constexpr

Add constexpr evaluation support for the VDBPSADBW (Double Block Packed
Sum-Absolute-Differences) intrinsics (__builtin_ia32_dbpsadbw128/256/512)
in both the tree-based constant evaluator (ExprConstant.cpp) and the
bytecode constexpr interpreter (InterpBuiltin.cpp).

The VDBPSADBW instruction computes the sum of absolute differences of
groups of 4 unsigned bytes from the second source against two 4-byte
reference blocks selected from the first source by the immediate operand.
Per 128-bit lane, imm8[1:0] selects blockA and imm8[3:2] selects blockB
from the first source. For each group of 4 bytes in the second source,
two SAD values are computed (one against each block), producing 8 result
words per 128-bit lane.

Care is taken to treat input bytes as unsigned (the builtin signature
uses signed char vectors) by extracting via getZExtValue() and casting
to uint8_t before computing absolute differences.

Fixes #188747

Signed-off-by: Pierluigi Lenoci <[email protected]>
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp     | 64 ++++++++++++++++++++
 clang/lib/AST/ExprConstant.cpp               | 57 +++++++++++++++++
 clang/test/CodeGen/X86/avx512bw-builtins.c   | 23 +++++--
 clang/test/CodeGen/X86/avx512vlbw-builtins.c | 58 +++++++++++++++---
 4 files changed, 190 insertions(+), 12 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 214013396e885..d9a14f84e4a8a 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2820,6 +2820,65 @@ static bool interp__builtin_ia32_pmul(
   return true;
 }
 
+static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
+                                          const CallExpr *Call) {
+  assert(Call->getNumArgs() == 3);
+  QualType Arg2Type = Call->getArg(2)->getType();
+  APSInt ImmVal = popToAPSInt(S, Arg2Type);
+  unsigned Imm = ImmVal.getZExtValue();
+
+  const Pointer &Src2 = S.Stk.pop<Pointer>();
+  const Pointer &Src1 = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  const auto *SrcVT = Call->getArg(0)->getType()->castAs<VectorType>();
+  PrimType SrcElemT = *S.getContext().classify(SrcVT->getElementType());
+  unsigned SourceLen = SrcVT->getNumElements();
+
+  const auto *DestVT = Call->getType()->castAs<VectorType>();
+  PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
+  bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
+  unsigned LaneSize = 16; // 128-bit lane = 16 bytes
+  unsigned NumLanes = SourceLen / LaneSize;
+  unsigned BlockOffsetA = (Imm & 0x3) * 4;
+  unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
+
+  unsigned DstIdx = 0;
+  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+    unsigned LaneStart = Lane * LaneSize;
+
+    for (unsigned J = 0; J < 4; ++J) {
+      unsigned SadA = 0;
+      unsigned SadB = 0;
+      for (unsigned K = 0; K < 4; ++K) {
+        unsigned A1Val, A2Val, BVal;
+        INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
+          // Treat as unsigned bytes
+          A1Val = static_cast<uint8_t>(
+              Src1.elem<T>(LaneStart + BlockOffsetA + K).toAPSInt().getZExtValue());
+          A2Val = static_cast<uint8_t>(
+              Src1.elem<T>(LaneStart + BlockOffsetB + K).toAPSInt().getZExtValue());
+          BVal = static_cast<uint8_t>(
+              Src2.elem<T>(LaneStart + 4 * J + K).toAPSInt().getZExtValue());
+        });
+        SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
+        SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);
+      }
+      INT_TYPE_SWITCH_NO_BOOL(DestElemT, {
+        Dst.elem<T>(DstIdx) =
+            static_cast<T>(APSInt(APInt(16, SadA), DestUnsigned));
+        Dst.elem<T>(DstIdx + 1) =
+            static_cast<T>(APSInt(APInt(16, SadB), DestUnsigned));
+      });
+      DstIdx += 2;
+    }
+  }
+
+  Dst.initializeAllElements();
+  return true;
+}
+
 static bool interp_builtin_horizontal_int_binop(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
@@ -4861,6 +4920,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                  (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth));
         });
 
+  case clang::X86::BI__builtin_ia32_dbpsadbw128:
+  case clang::X86::BI__builtin_ia32_dbpsadbw256:
+  case clang::X86::BI__builtin_ia32_dbpsadbw512:
+    return interp__builtin_ia32_dbpsadbw(S, OpPC, Call);
+
   case clang::X86::BI__builtin_ia32_pmulhuw128:
   case clang::X86::BI__builtin_ia32_pmulhuw256:
   case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 4f45fa728c605..fc4a4834b462a 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12564,6 +12564,63 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
+  case clang::X86::BI__builtin_ia32_dbpsadbw128:
+  case clang::X86::BI__builtin_ia32_dbpsadbw256:
+  case clang::X86::BI__builtin_ia32_dbpsadbw512: {
+    APValue SourceA, SourceB, SourceImm;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceA) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceB) ||
+        !EvaluateAsRValue(Info, E->getArg(2), SourceImm))
+      return false;
+
+    unsigned SourceLen = SourceA.getVectorLength();
+    unsigned LaneSize = 16; // 128-bit lane = 16 bytes
+    unsigned NumLanes = SourceLen / LaneSize;
+    unsigned Imm = SourceImm.getInt().getZExtValue();
+    unsigned BlockOffsetA = (Imm & 0x3) * 4;
+    unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
+
+    auto *DestTy = E->getType()->castAs<VectorType>();
+    QualType DestEltTy = DestTy->getElementType();
+    bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+    SmallVector<APValue, 32> ResultElements;
+    ResultElements.reserve(SourceLen / 2);
+
+    for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+      unsigned LaneStart = Lane * LaneSize;
+
+      for (unsigned J = 0; J < 4; ++J) {
+        // SADs of SourceB[4*J..4*J+3] vs blockA and blockB of SourceA
+        unsigned SadA = 0;
+        unsigned SadB = 0;
+        for (unsigned K = 0; K < 4; ++K) {
+          // Treat input bytes as unsigned
+          unsigned A = static_cast<uint8_t>(
+              SourceA.getVectorElt(LaneStart + BlockOffsetA + K)
+                  .getInt()
+                  .getZExtValue());
+          unsigned B = static_cast<uint8_t>(
+              SourceB.getVectorElt(LaneStart + 4 * J + K)
+                  .getInt()
+                  .getZExtValue());
+          SadA += (B > A) ? (B - A) : (A - B);
+
+          unsigned A2 = static_cast<uint8_t>(
+              SourceA.getVectorElt(LaneStart + BlockOffsetB + K)
+                  .getInt()
+                  .getZExtValue());
+          SadB += (B > A2) ? (B - A2) : (A2 - B);
+        }
+        ResultElements.push_back(
+            APValue(APSInt(APInt(16, SadA), DestUnsigned)));
+        ResultElements.push_back(
+            APValue(APSInt(APInt(16, SadB), DestUnsigned)));
+      }
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+
   case clang::X86::BI__builtin_ia32_pmulhuw128:
   case clang::X86::BI__builtin_ia32_pmulhuw256:
   case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 2020b72a649ae..488146e740db4 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -3230,21 +3230,36 @@ TEST_CONSTEXPR(match_v64qi(_mm512_maskz_alignr_epi8((__mmask64)0x000000000000000
 __m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
-  return _mm512_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm512_dbsad_epu8(__A, __B, 170);
+}
+// 512-bit: 4 lanes, imm8=0: blockA=blockB=lane[0..3] for each lane
+// Each lane behaves the same as the 128-bit case with matching data
+TEST_CONSTEXPR(match_v32hu(_mm512_dbsad_epu8(
+  ((__m512i)(__v64qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m512i)(__v64qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52,
+      4, 4, 20, 20, 36, 36, 52, 52));
 
 __m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
 
 __m512i test_mm512_mm_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mm_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.512
   //CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
-  return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
 
 __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index 0ee14909ae805..098ee29b1989e 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -3676,41 +3676,83 @@ TEST_CONSTEXPR(match_v32qi(_mm256_maskz_alignr_epi8((__mmask32)0xf000000f, ((__m
 __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
-  return _mm_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm_dbsad_epu8(__A, __B, 170);
+}
+// imm8=4: blockA=A[0..3]={0,1,2,3}, blockB=A[4..7]={4,5,6,7}
+// J=0: B[0..3]={1,2,3,4} vs blockA=4, vs blockB=12
+// J=1: B[4..7]={5,6,7,8} vs blockA=20, vs blockB=4
+// J=2: B[8..11]={9,10,11,12} vs blockA=36, vs blockB=20
+// J=3: B[12..15]={13,14,15,16} vs blockA=52, vs blockB=36
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 4, 12, 20, 4, 36, 20, 52, 36));
+// imm8=0: blockA=blockB=A[0..3]={0,1,2,3}
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52));
+// Test with unsigned values > 127 (signed overflow territory)
+// A[0..3]={200,100,50,25}, B[0..3]={180,120,40,30}
+// imm8=0: blockA=blockB=A[0..3]
+// SAD = |180-200|+|120-100|+|40-50|+|30-25| = 20+20+10+5 = 55
+TEST_CONSTEXPR(match_v8hu(_mm_dbsad_epu8(
+  ((__m128i)(__v16qu){200, 100, 50, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
+  ((__m128i)(__v16qu){180, 120, 40, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}),
+  0), 55, 55, 375, 375, 375, 375, 375, 375));
 
 __m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
+// Test masked version: mask=0x55 (keep even elements, passthrough odd)
+TEST_CONSTEXPR(match_v8hu(_mm_mask_dbsad_epu8(
+  ((__m128i)(__v8hu){99, 99, 99, 99, 99, 99, 99, 99}), (__mmask8)0x55,
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 4, 99, 20, 99, 36, 99, 52, 99));
 
 __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
-  return _mm_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
+// Test zero-masked version: mask=0xAA (keep odd elements, zero even)
+TEST_CONSTEXPR(match_v8hu(_mm_maskz_dbsad_epu8((__mmask8)0xAA,
+  ((__m128i)(__v16qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}),
+  ((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}),
+  4), 0, 12, 0, 4, 0, 20, 0, 36));
 
 __m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
-  return _mm256_dbsad_epu8(__A, __B, 170); 
-}
+  return _mm256_dbsad_epu8(__A, __B, 170);
+}
+// 256-bit: 2 lanes, imm8=0: blockA=blockB=lane[0..3]
+// Lane 0: same as 128-bit test above
+// Lane 1: A[16..19]={16,17,18,19}, B[16..19]={17,18,19,20} -> SAD=4
+TEST_CONSTEXPR(match_v16hu(_mm256_dbsad_epu8(
+  ((__m256i)(__v32qu){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}),
+  ((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}),
+  0), 4, 4, 20, 20, 36, 36, 52, 52, 4, 4, 20, 20, 36, 36, 52, 52));
 
 __m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+  return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170);
 }
 
 __m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dbsad_epu8
   // CHECK: @llvm.x86.avx512.dbpsadbw.256
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
-  return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170); 
+  return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170);
 }
 __mmask8 test_mm_movepi16_mask(__m128i __A) {
   // CHECK-LABEL: test_mm_movepi16_mask

>From 8b292ead3eb31c002f6deb2e7179b1b208f6076d Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <[email protected]>
Date: Fri, 27 Mar 2026 18:43:08 +0100
Subject: [PATCH 2/3] fix: address reviewer feedback for constexpr VDBPSADBW

- Add Constexpr tags to BuiltinsX86.td for VDBPSADBW builtins
- Update InterpBuiltin.cpp per tbaederr's suggestions:
  - Use popToUInt64 instead of popToAPSInt for immediate value
  - Use != instead of < in loop comparison
  - Simplify element access by removing unnecessary toAPSInt().getZExtValue()
- Apply clang-format fix in ExprConstant.cpp

Signed-off-by: Pierluigi Lenoci <[email protected]>
---
 clang/include/clang/Basic/BuiltinsX86.td |  6 +++---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 12 +++++-------
 clang/lib/AST/ExprConstant.cpp           |  8 ++++----
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index f47532a63de04..e54f8d66843bf 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -3197,15 +3197,15 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect
   def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def dbpsadbw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
 }
 
-let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def dbpsadbw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def dbpsadbw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
   def psadbw512 : X86Builtin<"_Vector<8, long long int>(_Vector<64, char>, _Vector<64, char>)">;
 }
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d9a14f84e4a8a..5d46b2c595b1f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2823,9 +2823,7 @@ static bool interp__builtin_ia32_pmul(
 static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
                                           const CallExpr *Call) {
   assert(Call->getNumArgs() == 3);
-  QualType Arg2Type = Call->getArg(2)->getType();
-  APSInt ImmVal = popToAPSInt(S, Arg2Type);
-  unsigned Imm = ImmVal.getZExtValue();
+  unsigned Imm = popToUInt64(S, Call->getArg(2));
 
   const Pointer &Src2 = S.Stk.pop<Pointer>();
   const Pointer &Src1 = S.Stk.pop<Pointer>();
@@ -2845,7 +2843,7 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
   unsigned BlockOffsetB = ((Imm >> 2) & 0x3) * 4;
 
   unsigned DstIdx = 0;
-  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
     unsigned LaneStart = Lane * LaneSize;
 
     for (unsigned J = 0; J < 4; ++J) {
@@ -2856,11 +2854,11 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
         INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
           // Treat as unsigned bytes
           A1Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetA + K).toAPSInt().getZExtValue());
+              Src1.elem<T>(LaneStart + BlockOffsetA + K));
           A2Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetB + K).toAPSInt().getZExtValue());
+              Src1.elem<T>(LaneStart + BlockOffsetB + K));
           BVal = static_cast<uint8_t>(
-              Src2.elem<T>(LaneStart + 4 * J + K).toAPSInt().getZExtValue());
+              Src2.elem<T>(LaneStart + 4 * J + K));
         });
         SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
         SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index fc4a4834b462a..2a6e1713fba4d 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12599,10 +12599,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
               SourceA.getVectorElt(LaneStart + BlockOffsetA + K)
                   .getInt()
                   .getZExtValue());
-          unsigned B = static_cast<uint8_t>(
-              SourceB.getVectorElt(LaneStart + 4 * J + K)
-                  .getInt()
-                  .getZExtValue());
+          unsigned B =
+              static_cast<uint8_t>(SourceB.getVectorElt(LaneStart + 4 * J + K)
+                                       .getInt()
+                                       .getZExtValue());
           SadA += (B > A) ? (B - A) : (A - B);
 
           unsigned A2 = static_cast<uint8_t>(

>From b2ef04823423b38720746a63a0a661a54852de90 Mon Sep 17 00:00:00 2001
From: Pierluigi Lenoci <[email protected]>
Date: Sat, 28 Mar 2026 16:50:42 +0100
Subject: [PATCH 3/3] style: apply clang-format to modified files

Signed-off-by: Pierluigi Lenoci <[email protected]>
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5d46b2c595b1f..4ba611cf68013 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2853,12 +2853,11 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC,
         unsigned A1Val, A2Val, BVal;
         INT_TYPE_SWITCH_NO_BOOL(SrcElemT, {
           // Treat as unsigned bytes
-          A1Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetA + K));
-          A2Val = static_cast<uint8_t>(
-              Src1.elem<T>(LaneStart + BlockOffsetB + K));
-          BVal = static_cast<uint8_t>(
-              Src2.elem<T>(LaneStart + 4 * J + K));
+          A1Val =
+              static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetA + K));
+          A2Val =
+              static_cast<uint8_t>(Src1.elem<T>(LaneStart + BlockOffsetB + K));
+          BVal = static_cast<uint8_t>(Src2.elem<T>(LaneStart + 4 * J + K));
         });
         SadA += (BVal > A1Val) ? (BVal - A1Val) : (A1Val - BVal);
         SadB += (BVal > A2Val) ? (BVal - A2Val) : (A2Val - BVal);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [Clang] Allow VDBPSADBW intrinsics in constexpr (PR #188887)

Reply via email to