[clang] Implement `__builtin_elementwise_pext` and `__builtin_elementwise_pdep` (PR #204296)

Jan Schultke via cfe-commits Tue, 16 Jun 2026 23:33:45 -0700

https://github.com/eisenwave created 
https://github.com/llvm/llvm-project/pull/204296


Closes #204126

This PR adds `__builtin_elementwise_pext` to emit `@llvm.pext` and 
`__builtin_elementwise_pdep` to emit `@llvm.pdep`.

The approach here is a carbon copy of #196633, which recently added 
`__builtin_elementwise_clmul`.

From f67a8bccdac0215ac7415fd78a4d5807514f8d35 Mon Sep 17 00:00:00 2001
From: Eisenwave <[email protected]>
Date: Wed, 17 Jun 2026 08:31:52 +0200
Subject: [PATCH] [clang] Implement __builtin_elementwise_pext and
 __builtin_elementwise_pdep

---
 clang/docs/LanguageExtensions.rst             |  4 ++
 clang/docs/ReleaseNotes.rst                   |  4 ++
 clang/include/clang/Basic/Builtins.td         | 12 ++++
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  2 +
 clang/lib/AST/ExprConstant.cpp                | 10 ++-
 clang/lib/CodeGen/CGBuiltin.cpp               |  6 ++
 clang/lib/Sema/SemaChecking.cpp               |  2 +
 clang/test/AST/ByteCode/builtin-functions.cpp | 40 +++++++++++
 .../test/CodeGen/builtins-elementwise-math.c  | 66 ++++++++++++++++++
 clang/test/Sema/builtins-elementwise-math.c   | 68 +++++++++++++++++++
 10 files changed, 212 insertions(+), 2 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst 
b/clang/docs/LanguageExtensions.rst
index f378a73c20de0..3952cdeed4b77 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -905,6 +905,10 @@ T __builtin_elementwise_fshr(T x, T y, T z)     perform a 
funnel shift right. Co
                                                 first argument is 0 and no 
second argument is provided.
 T __builtin_elementwise_clmul(T x, T y)         perform a carry-less 
multiplication of x and y, returning the least    integer types
                                                 significant bits of the wide 
result.
+T __builtin_elementwise_pext(T x, T m)          extract bits from x selected by the mask m, pack them contiguously     integer types
+                                                into the least significant bits of the result, and zero the rest.
+T __builtin_elementwise_pdep(T x, T m)          deposit the least significant bits of x at the positions               integer types
+                                                where m has a 1-bit, and zero the rest.
 ============================================== 
====================================================================== 
=========================================
 
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7828135a6edbc..1afc4cf4c3eca 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -305,6 +305,10 @@ Non-comprehensive list of changes in this release
   integers including ``_BitInt`` types. This includes constexpr evaluation
   support.
 
+- Added ``__builtin_elementwise_pext`` and ``__builtin_elementwise_pdep`` for
+  parallel bit extract and parallel bit deposit operations on integers including
+  ``_BitInt`` types. This includes constexpr evaluation support.
+
 - Deprecated float types support from ``__builtin_elementwise_max`` and
   ``__builtin_elementwise_min``.
 
diff --git a/clang/include/clang/Basic/Builtins.td 
b/clang/include/clang/Basic/Builtins.td
index 25abd1f36539e..d873556ad5abb 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1835,6 +1835,18 @@ def ElementwiseClmul : Builtin {
   let Prototype = "void(...)";
 }
 
+def ElementwisePext : Builtin {
+  let Spellings = ["__builtin_elementwise_pext"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
+  let Prototype = "void(...)";
+}
+
+def ElementwisePdep : Builtin {
+  let Spellings = ["__builtin_elementwise_pdep"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
+  let Prototype = "void(...)";
+}
+
 def ReduceMax : Builtin {
   let Spellings = ["__builtin_reduce_max"];
   let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp 
b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 55907bf11506b..15b143d7dbbba 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -5116,11 +5116,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, 
const CallExpr *Call,
 
   case clang::X86::BI__builtin_ia32_pdep_si:
   case clang::X86::BI__builtin_ia32_pdep_di:
+  case Builtin::BI__builtin_elementwise_pdep:
     return interp__builtin_elementwise_int_binop(S, OpPC, Call,
                                                  llvm::APIntOps::expandBits);
 
   case clang::X86::BI__builtin_ia32_pext_si:
   case clang::X86::BI__builtin_ia32_pext_di:
+  case Builtin::BI__builtin_elementwise_pext:
     return interp__builtin_elementwise_int_binop(S, OpPC, Call,
                                                  llvm::APIntOps::compressBits);
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index bc98c0d86bb65..250de0321130e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -14120,6 +14120,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr 
*E) {
   }
   case Builtin::BI__builtin_elementwise_clmul:
     return EvaluateBinOpExpr(llvm::APIntOps::clmul);
+  case Builtin::BI__builtin_elementwise_pext:
+    return EvaluateBinOpExpr(llvm::APIntOps::compressBits);
+  case Builtin::BI__builtin_elementwise_pdep:
+    return EvaluateBinOpExpr(llvm::APIntOps::expandBits);
   case Builtin::BI__builtin_elementwise_fshl:
   case Builtin::BI__builtin_elementwise_fshr: {
     APValue SourceHi, SourceLo, SourceShift;
@@ -17920,7 +17924,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const 
CallExpr *E,
   }
 
   case clang::X86::BI__builtin_ia32_pdep_si:
-  case clang::X86::BI__builtin_ia32_pdep_di: {
+  case clang::X86::BI__builtin_ia32_pdep_di:
+  case Builtin::BI__builtin_elementwise_pdep: {
     APSInt Val, Msk;
     if (!EvaluateInteger(E->getArg(0), Val, Info) ||
         !EvaluateInteger(E->getArg(1), Msk, Info))
@@ -17929,7 +17934,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const 
CallExpr *E,
   }
 
   case clang::X86::BI__builtin_ia32_pext_si:
-  case clang::X86::BI__builtin_ia32_pext_di: {
+  case clang::X86::BI__builtin_ia32_pext_di:
+  case Builtin::BI__builtin_elementwise_pext: {
     APSInt Val, Msk;
     if (!EvaluateInteger(E->getArg(0), Val, Info) ||
         !EvaluateInteger(E->getArg(1), Msk, Info))
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 682b125890fe1..23605f4158141 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4544,6 +4544,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl 
GD, unsigned BuiltinID,
   case Builtin::BI__builtin_elementwise_clmul:
     return RValue::get(
         emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::clmul));
+  case Builtin::BI__builtin_elementwise_pext:
+    return RValue::get(
+        emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::pext));
+  case Builtin::BI__builtin_elementwise_pdep:
+    return RValue::get(
+        emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::pdep));
 
   case Builtin::BI__builtin_elementwise_add_sat:
   case Builtin::BI__builtin_elementwise_sub_sat: {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index b8a3f48a32f24..ec4a9037f5c23 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3701,6 +3701,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, 
unsigned BuiltinID,
   case Builtin::BI__builtin_elementwise_add_sat:
   case Builtin::BI__builtin_elementwise_sub_sat:
   case Builtin::BI__builtin_elementwise_clmul:
+  case Builtin::BI__builtin_elementwise_pext:
+  case Builtin::BI__builtin_elementwise_pdep:
     if (BuiltinElementwiseMath(TheCall,
                                EltwiseBuiltinArgTyRestriction::IntegerTy))
       return ExprError();
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp 
b/clang/test/AST/ByteCode/builtin-functions.cpp
index 57157392f6a6e..24f5c1745c2b8 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1402,6 +1402,46 @@ namespace ElementwiseClmul {
                     (vector4uint){0U, 1U, 3U, 7U})) == 27U);
 }
 
+namespace ElementwisePext {
+  static_assert(__builtin_elementwise_pext(0U, 0U) == 0U);
+  static_assert(__builtin_elementwise_pext(0xFFU, 0xFFU) == 0xFFU);
+  static_assert(__builtin_elementwise_pext(0xFFU, 0x0FU) == 0x0FU);
+  static_assert(__builtin_elementwise_pext(0xFFU, 0xF0U) == 0x0FU);
+  static_assert(__builtin_elementwise_pext(0b1010'1010U, 0b1100'1100U) ==
+                0b0000'1010U);
+  static_assert(__builtin_elementwise_pext(0b1111'1111U, 0b1010'1010U) ==
+                0b0000'1111U);
+#ifndef __AVR__
+  static_assert(__builtin_elementwise_pext((unsigned _BitInt(31))0xFF,
+                                           (unsigned _BitInt(31))0x0F) ==
+                (unsigned _BitInt(31))0x0F);
+#endif
+
+  static_assert(__builtin_reduce_add(__builtin_elementwise_pext(
+                    (vector4uint){0xAAU, 0xFFU, 0x55U, 0x00U},
+                    (vector4uint){0xCCU, 0xAAU, 0x0FU, 0x00U})) == 0x1EU);
+}
+
+namespace ElementwisePdep {
+  static_assert(__builtin_elementwise_pdep(0U, 0U) == 0U);
+  static_assert(__builtin_elementwise_pdep(0xFFU, 0xFFU) == 0xFFU);
+  static_assert(__builtin_elementwise_pdep(0x0FU, 0xFFU) == 0x0FU);
+  static_assert(__builtin_elementwise_pdep(0x0FU, 0xF0U) == 0xF0U);
+  static_assert(__builtin_elementwise_pdep(0b0000'1010U, 0b1100'1100U) ==
+                0b1000'1000U);
+  static_assert(__builtin_elementwise_pdep(0b0000'1111U, 0b1010'1010U) ==
+                0b1010'1010U);
+#ifndef __AVR__
+  static_assert(__builtin_elementwise_pdep((unsigned _BitInt(31))0x0F,
+                                           (unsigned _BitInt(31))0xFF) ==
+                (unsigned _BitInt(31))0x0F);
+#endif
+
+  static_assert(__builtin_reduce_add(__builtin_elementwise_pdep(
+                    (vector4uint){0x0AU, 0x0FU, 0x05U, 0x00U},
+                    (vector4uint){0xCCU, 0xAAU, 0x0FU, 0x00U})) == 0x137U);
+}
+
 namespace BuiltinMemcpy {
   constexpr int simple() {
     int a = 12;
diff --git a/clang/test/CodeGen/builtins-elementwise-math.c 
b/clang/test/CodeGen/builtins-elementwise-math.c
index d0e4a6fa10cfc..63371ea729228 100644
--- a/clang/test/CodeGen/builtins-elementwise-math.c
+++ b/clang/test/CodeGen/builtins-elementwise-math.c
@@ -1330,6 +1330,72 @@ void test_builtin_elementwise_clmul(unsigned int ui1, 
unsigned int ui2,
   bi1 = __builtin_elementwise_clmul(bi1, bi2);
 }
 
+void test_builtin_elementwise_pext(unsigned int ui1, unsigned int ui2,
+                                   unsigned short us1, unsigned short us2,
+                                   u4 vu1, u4 vu2,
+                                   unsigned _BitInt(31) bi1,
+                                   unsigned _BitInt(31) bi2) {
+  // CHECK:      [[UI1:%.+]] = load i32, ptr %ui1.addr, align 4
+  // CHECK-NEXT: [[UI2:%.+]] = load i32, ptr %ui2.addr, align 4
+  // CHECK-NEXT: [[UI3:%.+]] = call i32 @llvm.pext.i32(i32 [[UI1]], i32 
[[UI2]])
+  // CHECK-NEXT: store i32 [[UI3]], ptr %ui1.addr, align 4
+  ui1 = __builtin_elementwise_pext(ui1, ui2);
+
+  // CHECK:      [[US1:%.+]] = load i16, ptr %us1.addr, align 2
+  // CHECK-NEXT: [[US2:%.+]] = load i16, ptr %us2.addr, align 2
+  // CHECK-NEXT: [[US3:%.+]] = call i16 @llvm.pext.i16(i16 [[US1]], i16 
[[US2]])
+  // CHECK-NEXT: store i16 [[US3]], ptr %us1.addr, align 2
+  us1 = __builtin_elementwise_pext(us1, us2);
+
+  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
+  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
+  // CHECK-NEXT: [[VU3:%.+]] = call <4 x i32> @llvm.pext.v4i32(<4 x i32> 
[[VU1]], <4 x i32> [[VU2]])
+  // CHECK-NEXT: store <4 x i32> [[VU3]], ptr %vu1.addr, align 16
+  vu1 = __builtin_elementwise_pext(vu1, vu2);
+
+  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
+  // CHECK-NEXT: [[BI1TRUNC:%.+]] = trunc i32 [[BI1]] to i31
+  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
+  // CHECK-NEXT: [[BI2TRUNC:%.+]] = trunc i32 [[BI2]] to i31
+  // CHECK-NEXT: [[BIRES:%.+]] = call i31 @llvm.pext.i31(i31 [[BI1TRUNC]], i31 
[[BI2TRUNC]])
+  // CHECK-NEXT: [[BIRESZEXT:%.+]] = zext i31 [[BIRES]] to i32
+  // CHECK-NEXT: store i32 [[BIRESZEXT]], ptr %bi1.addr, align 4
+  bi1 = __builtin_elementwise_pext(bi1, bi2);
+}
+
+void test_builtin_elementwise_pdep(unsigned int ui1, unsigned int ui2,
+                                   unsigned short us1, unsigned short us2,
+                                   u4 vu1, u4 vu2,
+                                   unsigned _BitInt(31) bi1,
+                                   unsigned _BitInt(31) bi2) {
+  // CHECK:      [[UI1:%.+]] = load i32, ptr %ui1.addr, align 4
+  // CHECK-NEXT: [[UI2:%.+]] = load i32, ptr %ui2.addr, align 4
+  // CHECK-NEXT: [[UI3:%.+]] = call i32 @llvm.pdep.i32(i32 [[UI1]], i32 
[[UI2]])
+  // CHECK-NEXT: store i32 [[UI3]], ptr %ui1.addr, align 4
+  ui1 = __builtin_elementwise_pdep(ui1, ui2);
+
+  // CHECK:      [[US1:%.+]] = load i16, ptr %us1.addr, align 2
+  // CHECK-NEXT: [[US2:%.+]] = load i16, ptr %us2.addr, align 2
+  // CHECK-NEXT: [[US3:%.+]] = call i16 @llvm.pdep.i16(i16 [[US1]], i16 
[[US2]])
+  // CHECK-NEXT: store i16 [[US3]], ptr %us1.addr, align 2
+  us1 = __builtin_elementwise_pdep(us1, us2);
+
+  // CHECK:      [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16
+  // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16
+  // CHECK-NEXT: [[VU3:%.+]] = call <4 x i32> @llvm.pdep.v4i32(<4 x i32> 
[[VU1]], <4 x i32> [[VU2]])
+  // CHECK-NEXT: store <4 x i32> [[VU3]], ptr %vu1.addr, align 16
+  vu1 = __builtin_elementwise_pdep(vu1, vu2);
+
+  // CHECK:      [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4
+  // CHECK-NEXT: [[BI1TRUNC:%.+]] = trunc i32 [[BI1]] to i31
+  // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4
+  // CHECK-NEXT: [[BI2TRUNC:%.+]] = trunc i32 [[BI2]] to i31
+  // CHECK-NEXT: [[BIRES:%.+]] = call i31 @llvm.pdep.i31(i31 [[BI1TRUNC]], i31 
[[BI2TRUNC]])
+  // CHECK-NEXT: [[BIRESZEXT:%.+]] = zext i31 [[BIRES]] to i32
+  // CHECK-NEXT: store i32 [[BIRESZEXT]], ptr %bi1.addr, align 4
+  bi1 = __builtin_elementwise_pdep(bi1, bi2);
+}
+
 void test_builtin_elementwise_clzg(si8 vs1, si8 vs2, u4 vu1,
                                    long long int lli, short si,
                                    _BitInt(31) bi, int i,
diff --git a/clang/test/Sema/builtins-elementwise-math.c 
b/clang/test/Sema/builtins-elementwise-math.c
index 4ffdcee3ca9c7..511d1d8b43329 100644
--- a/clang/test/Sema/builtins-elementwise-math.c
+++ b/clang/test/Sema/builtins-elementwise-math.c
@@ -214,6 +214,74 @@ void test_builtin_elementwise_clmul(int i, short s, double 
d, float4 v,
   vu = __builtin_elementwise_clmul(vu, vu);
 }
 
+void test_builtin_elementwise_pext(int i, short s, double d, float4 v,
+                                   int3 iv, unsigned3 uv, unsigned u,
+                                   unsigned4 vu, int *p) {
+  i = __builtin_elementwise_pext(p, d);
+  // expected-error@-1 {{1st argument must be a scalar or vector of integer 
types (was 'int *')}}
+
+  struct Foo foo = __builtin_elementwise_pext(i, i);
+  // expected-error@-1 {{initializing 'struct Foo' with an expression of 
incompatible type 'int'}}
+
+  i = __builtin_elementwise_pext(i);
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 
1}}
+
+  i = __builtin_elementwise_pext();
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 
0}}
+
+  i = __builtin_elementwise_pext(i, i, i);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 
3}}
+
+  i = __builtin_elementwise_pext(v, v);
+  // expected-error@-1 {{1st argument must be a scalar or vector of integer 
types (was 'float4' (vector of 4 'float' values))}}
+
+  i = __builtin_elementwise_pext(i, s);
+  // expected-error@-1 {{arguments are of different types ('int' vs 'short')}}
+
+  i = __builtin_elementwise_pext(uv, iv);
+  // expected-error@-1 {{arguments are of different types ('unsigned3' (vector 
of 3 'unsigned int' values) vs 'int3' (vector of 3 'int' values))}}
+
+  unsigned _BitInt(31) ext; // expected-warning {{'_BitInt' in C17 and earlier 
is a Clang extension}}
+  ext = __builtin_elementwise_pext(ext, ext);
+
+  u = __builtin_elementwise_pext(u, u);
+  vu = __builtin_elementwise_pext(vu, vu);
+}
+
+void test_builtin_elementwise_pdep(int i, short s, double d, float4 v,
+                                   int3 iv, unsigned3 uv, unsigned u,
+                                   unsigned4 vu, int *p) {
+  i = __builtin_elementwise_pdep(p, d);
+  // expected-error@-1 {{1st argument must be a scalar or vector of integer 
types (was 'int *')}}
+
+  struct Foo foo = __builtin_elementwise_pdep(i, i);
+  // expected-error@-1 {{initializing 'struct Foo' with an expression of 
incompatible type 'int'}}
+
+  i = __builtin_elementwise_pdep(i);
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 
1}}
+
+  i = __builtin_elementwise_pdep();
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 
0}}
+
+  i = __builtin_elementwise_pdep(i, i, i);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 
3}}
+
+  i = __builtin_elementwise_pdep(v, v);
+  // expected-error@-1 {{1st argument must be a scalar or vector of integer 
types (was 'float4' (vector of 4 'float' values))}}
+
+  i = __builtin_elementwise_pdep(i, s);
+  // expected-error@-1 {{arguments are of different types ('int' vs 'short')}}
+
+  i = __builtin_elementwise_pdep(uv, iv);
+  // expected-error@-1 {{arguments are of different types ('unsigned3' (vector 
of 3 'unsigned int' values) vs 'int3' (vector of 3 'int' values))}}
+
+  unsigned _BitInt(31) ext; // expected-warning {{'_BitInt' in C17 and earlier 
is a Clang extension}}
+  ext = __builtin_elementwise_pdep(ext, ext);
+
+  u = __builtin_elementwise_pdep(u, u);
+  vu = __builtin_elementwise_pdep(vu, vu);
+}
+
 void test_builtin_elementwise_max(int i, short s, double d, float4 v, int3 iv, 
unsigned3 uv, int *p) {
   i = __builtin_elementwise_max(p, d);
   // expected-error@-1 {{1st argument must be a vector, integer or 
floating-point type (was 'int *')}}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] Implement `__builtin_elementwise_pext` and `__builtin_elementwise_pdep` (PR #204296)

Reply via email to