https://github.com/eisenwave created https://github.com/llvm/llvm-project/pull/204296
Closes #204126. This PR adds `__builtin_elementwise_pext` to emit `@llvm.pext` and `__builtin_elementwise_pdep` to emit `@llvm.pdep`. The approach here is a carbon copy of #196633, which recently added `__builtin_elementwise_clmul`. From f67a8bccdac0215ac7415fd78a4d5807514f8d35 Mon Sep 17 00:00:00 2001 From: Eisenwave <[email protected]> Date: Wed, 17 Jun 2026 08:31:52 +0200 Subject: [PATCH] [clang] Implement __builtin_elementwise_pext and __builtin_elementwise_pdep --- clang/docs/LanguageExtensions.rst | 4 ++ clang/docs/ReleaseNotes.rst | 4 ++ clang/include/clang/Basic/Builtins.td | 12 ++++ clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 + clang/lib/AST/ExprConstant.cpp | 10 ++- clang/lib/CodeGen/CGBuiltin.cpp | 6 ++ clang/lib/Sema/SemaChecking.cpp | 2 + clang/test/AST/ByteCode/builtin-functions.cpp | 40 +++++++++++ .../test/CodeGen/builtins-elementwise-math.c | 66 ++++++++++++++++++ clang/test/Sema/builtins-elementwise-math.c | 68 +++++++++++++++++++ 10 files changed, 212 insertions(+), 2 deletions(-) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index f378a73c20de0..3952cdeed4b77 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -905,6 +905,10 @@ T __builtin_elementwise_fshr(T x, T y, T z) perform a funnel shift right. Co first argument is 0 and no second argument is provided. T __builtin_elementwise_clmul(T x, T y) perform a carry-less multiplication of x and y, returning the least integer types significant bits of the wide result. +T __builtin_elementwise_pext(T x, T m) extract bits from x selected by the mask m, pack them contiguously integer types + into the least significant bits of the result, and zero the rest. +T __builtin_elementwise_pdep(T x, T m) deposit the least significant bits of x at the positions integer types + where m has a 1-bit, and zero the rest. 
============================================== ====================================================================== ========================================= diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7828135a6edbc..1afc4cf4c3eca 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -305,6 +305,10 @@ Non-comprehensive list of changes in this release integers including ``_BitInt`` types. This includes constexpr evaluation support. +- Added ``__builtin_elementwise_pext`` and ``__builtin_elementwise_pdep`` for + parallel bit extract and parallel bit deposit operations on integers including + ``_BitInt`` types. This includes constexpr evaluation support. + - Deprecated float types support from ``__builtin_elementwise_max`` and ``__builtin_elementwise_min``. diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 25abd1f36539e..d873556ad5abb 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1835,6 +1835,18 @@ def ElementwiseClmul : Builtin { let Prototype = "void(...)"; } +def ElementwisePext : Builtin { + let Spellings = ["__builtin_elementwise_pext"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "void(...)"; +} + +def ElementwisePdep : Builtin { + let Spellings = ["__builtin_elementwise_pdep"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "void(...)"; +} + def ReduceMax : Builtin { let Spellings = ["__builtin_reduce_max"]; let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 55907bf11506b..15b143d7dbbba 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -5116,11 +5116,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case 
clang::X86::BI__builtin_ia32_pdep_si: case clang::X86::BI__builtin_ia32_pdep_di: + case Builtin::BI__builtin_elementwise_pdep: return interp__builtin_elementwise_int_binop(S, OpPC, Call, llvm::APIntOps::expandBits); case clang::X86::BI__builtin_ia32_pext_si: case clang::X86::BI__builtin_ia32_pext_di: + case Builtin::BI__builtin_elementwise_pext: return interp__builtin_elementwise_int_binop(S, OpPC, Call, llvm::APIntOps::compressBits); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index bc98c0d86bb65..250de0321130e 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -14120,6 +14120,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { } case Builtin::BI__builtin_elementwise_clmul: return EvaluateBinOpExpr(llvm::APIntOps::clmul); + case Builtin::BI__builtin_elementwise_pext: + return EvaluateBinOpExpr(llvm::APIntOps::compressBits); + case Builtin::BI__builtin_elementwise_pdep: + return EvaluateBinOpExpr(llvm::APIntOps::expandBits); case Builtin::BI__builtin_elementwise_fshl: case Builtin::BI__builtin_elementwise_fshr: { APValue SourceHi, SourceLo, SourceShift; @@ -17920,7 +17924,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, } case clang::X86::BI__builtin_ia32_pdep_si: - case clang::X86::BI__builtin_ia32_pdep_di: { + case clang::X86::BI__builtin_ia32_pdep_di: + case Builtin::BI__builtin_elementwise_pdep: { APSInt Val, Msk; if (!EvaluateInteger(E->getArg(0), Val, Info) || !EvaluateInteger(E->getArg(1), Msk, Info)) @@ -17929,7 +17934,8 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, } case clang::X86::BI__builtin_ia32_pext_si: - case clang::X86::BI__builtin_ia32_pext_di: { + case clang::X86::BI__builtin_ia32_pext_di: + case Builtin::BI__builtin_elementwise_pext: { APSInt Val, Msk; if (!EvaluateInteger(E->getArg(0), Val, Info) || !EvaluateInteger(E->getArg(1), Msk, Info)) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 
682b125890fe1..23605f4158141 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4544,6 +4544,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_elementwise_clmul: return RValue::get( emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::clmul)); + case Builtin::BI__builtin_elementwise_pext: + return RValue::get( + emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::pext)); + case Builtin::BI__builtin_elementwise_pdep: + return RValue::get( + emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::pdep)); case Builtin::BI__builtin_elementwise_add_sat: case Builtin::BI__builtin_elementwise_sub_sat: { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index b8a3f48a32f24..ec4a9037f5c23 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3701,6 +3701,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BI__builtin_elementwise_add_sat: case Builtin::BI__builtin_elementwise_sub_sat: case Builtin::BI__builtin_elementwise_clmul: + case Builtin::BI__builtin_elementwise_pext: + case Builtin::BI__builtin_elementwise_pdep: if (BuiltinElementwiseMath(TheCall, EltwiseBuiltinArgTyRestriction::IntegerTy)) return ExprError(); diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 57157392f6a6e..24f5c1745c2b8 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -1402,6 +1402,46 @@ namespace ElementwiseClmul { (vector4uint){0U, 1U, 3U, 7U})) == 27U); } +namespace ElementwisePext { + static_assert(__builtin_elementwise_pext(0U, 0U) == 0U); + static_assert(__builtin_elementwise_pext(0xFFU, 0xFFU) == 0xFFU); + static_assert(__builtin_elementwise_pext(0xFFU, 0x0FU) == 0x0FU); + static_assert(__builtin_elementwise_pext(0xFFU, 0xF0U) == 0x0FU); + 
static_assert(__builtin_elementwise_pext(0b1010'1010U, 0b1100'1100U) == + 0b0000'1010U); + static_assert(__builtin_elementwise_pext(0b1111'1111U, 0b1010'1010U) == + 0b0000'1111U); +#ifndef __AVR__ + static_assert(__builtin_elementwise_pext((unsigned _BitInt(31))0xFF, + (unsigned _BitInt(31))0x0F) == + (unsigned _BitInt(31))0x0F); +#endif + + static_assert(__builtin_reduce_add(__builtin_elementwise_pext( + (vector4uint){0xAAU, 0xFFU, 0x55U, 0x00U}, + (vector4uint){0xCCU, 0xAAU, 0x0FU, 0x00U})) == 0x1EU); +} + +namespace ElementwisePdep { + static_assert(__builtin_elementwise_pdep(0U, 0U) == 0U); + static_assert(__builtin_elementwise_pdep(0xFFU, 0xFFU) == 0xFFU); + static_assert(__builtin_elementwise_pdep(0x0FU, 0xFFU) == 0x0FU); + static_assert(__builtin_elementwise_pdep(0x0FU, 0xF0U) == 0xF0U); + static_assert(__builtin_elementwise_pdep(0b0000'1010U, 0b1100'1100U) == + 0b1000'1000U); + static_assert(__builtin_elementwise_pdep(0b0000'1111U, 0b1010'1010U) == + 0b1010'1010U); +#ifndef __AVR__ + static_assert(__builtin_elementwise_pdep((unsigned _BitInt(31))0x0F, + (unsigned _BitInt(31))0xFF) == + (unsigned _BitInt(31))0x0F); +#endif + + static_assert(__builtin_reduce_add(__builtin_elementwise_pdep( + (vector4uint){0x0AU, 0x0FU, 0x05U, 0x00U}, + (vector4uint){0xCCU, 0xAAU, 0x0FU, 0x00U})) == 0x137U); +} + namespace BuiltinMemcpy { constexpr int simple() { int a = 12; diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c index d0e4a6fa10cfc..63371ea729228 100644 --- a/clang/test/CodeGen/builtins-elementwise-math.c +++ b/clang/test/CodeGen/builtins-elementwise-math.c @@ -1330,6 +1330,72 @@ void test_builtin_elementwise_clmul(unsigned int ui1, unsigned int ui2, bi1 = __builtin_elementwise_clmul(bi1, bi2); } +void test_builtin_elementwise_pext(unsigned int ui1, unsigned int ui2, + unsigned short us1, unsigned short us2, + u4 vu1, u4 vu2, + unsigned _BitInt(31) bi1, + unsigned _BitInt(31) bi2) { + // CHECK: [[UI1:%.+]] 
= load i32, ptr %ui1.addr, align 4 + // CHECK-NEXT: [[UI2:%.+]] = load i32, ptr %ui2.addr, align 4 + // CHECK-NEXT: [[UI3:%.+]] = call i32 @llvm.pext.i32(i32 [[UI1]], i32 [[UI2]]) + // CHECK-NEXT: store i32 [[UI3]], ptr %ui1.addr, align 4 + ui1 = __builtin_elementwise_pext(ui1, ui2); + + // CHECK: [[US1:%.+]] = load i16, ptr %us1.addr, align 2 + // CHECK-NEXT: [[US2:%.+]] = load i16, ptr %us2.addr, align 2 + // CHECK-NEXT: [[US3:%.+]] = call i16 @llvm.pext.i16(i16 [[US1]], i16 [[US2]]) + // CHECK-NEXT: store i16 [[US3]], ptr %us1.addr, align 2 + us1 = __builtin_elementwise_pext(us1, us2); + + // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16 + // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16 + // CHECK-NEXT: [[VU3:%.+]] = call <4 x i32> @llvm.pext.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]]) + // CHECK-NEXT: store <4 x i32> [[VU3]], ptr %vu1.addr, align 16 + vu1 = __builtin_elementwise_pext(vu1, vu2); + + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[BI1TRUNC:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4 + // CHECK-NEXT: [[BI2TRUNC:%.+]] = trunc i32 [[BI2]] to i31 + // CHECK-NEXT: [[BIRES:%.+]] = call i31 @llvm.pext.i31(i31 [[BI1TRUNC]], i31 [[BI2TRUNC]]) + // CHECK-NEXT: [[BIRESZEXT:%.+]] = zext i31 [[BIRES]] to i32 + // CHECK-NEXT: store i32 [[BIRESZEXT]], ptr %bi1.addr, align 4 + bi1 = __builtin_elementwise_pext(bi1, bi2); +} + +void test_builtin_elementwise_pdep(unsigned int ui1, unsigned int ui2, + unsigned short us1, unsigned short us2, + u4 vu1, u4 vu2, + unsigned _BitInt(31) bi1, + unsigned _BitInt(31) bi2) { + // CHECK: [[UI1:%.+]] = load i32, ptr %ui1.addr, align 4 + // CHECK-NEXT: [[UI2:%.+]] = load i32, ptr %ui2.addr, align 4 + // CHECK-NEXT: [[UI3:%.+]] = call i32 @llvm.pdep.i32(i32 [[UI1]], i32 [[UI2]]) + // CHECK-NEXT: store i32 [[UI3]], ptr %ui1.addr, align 4 + ui1 = __builtin_elementwise_pdep(ui1, ui2); + + // CHECK: [[US1:%.+]] = load 
i16, ptr %us1.addr, align 2 + // CHECK-NEXT: [[US2:%.+]] = load i16, ptr %us2.addr, align 2 + // CHECK-NEXT: [[US3:%.+]] = call i16 @llvm.pdep.i16(i16 [[US1]], i16 [[US2]]) + // CHECK-NEXT: store i16 [[US3]], ptr %us1.addr, align 2 + us1 = __builtin_elementwise_pdep(us1, us2); + + // CHECK: [[VU1:%.+]] = load <4 x i32>, ptr %vu1.addr, align 16 + // CHECK-NEXT: [[VU2:%.+]] = load <4 x i32>, ptr %vu2.addr, align 16 + // CHECK-NEXT: [[VU3:%.+]] = call <4 x i32> @llvm.pdep.v4i32(<4 x i32> [[VU1]], <4 x i32> [[VU2]]) + // CHECK-NEXT: store <4 x i32> [[VU3]], ptr %vu1.addr, align 16 + vu1 = __builtin_elementwise_pdep(vu1, vu2); + + // CHECK: [[BI1:%.+]] = load i32, ptr %bi1.addr, align 4 + // CHECK-NEXT: [[BI1TRUNC:%.+]] = trunc i32 [[BI1]] to i31 + // CHECK-NEXT: [[BI2:%.+]] = load i32, ptr %bi2.addr, align 4 + // CHECK-NEXT: [[BI2TRUNC:%.+]] = trunc i32 [[BI2]] to i31 + // CHECK-NEXT: [[BIRES:%.+]] = call i31 @llvm.pdep.i31(i31 [[BI1TRUNC]], i31 [[BI2TRUNC]]) + // CHECK-NEXT: [[BIRESZEXT:%.+]] = zext i31 [[BIRES]] to i32 + // CHECK-NEXT: store i32 [[BIRESZEXT]], ptr %bi1.addr, align 4 + bi1 = __builtin_elementwise_pdep(bi1, bi2); +} + void test_builtin_elementwise_clzg(si8 vs1, si8 vs2, u4 vu1, long long int lli, short si, _BitInt(31) bi, int i, diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c index 4ffdcee3ca9c7..511d1d8b43329 100644 --- a/clang/test/Sema/builtins-elementwise-math.c +++ b/clang/test/Sema/builtins-elementwise-math.c @@ -214,6 +214,74 @@ void test_builtin_elementwise_clmul(int i, short s, double d, float4 v, vu = __builtin_elementwise_clmul(vu, vu); } +void test_builtin_elementwise_pext(int i, short s, double d, float4 v, + int3 iv, unsigned3 uv, unsigned u, + unsigned4 vu, int *p) { + i = __builtin_elementwise_pext(p, d); + // expected-error@-1 {{1st argument must be a scalar or vector of integer types (was 'int *')}} + + struct Foo foo = __builtin_elementwise_pext(i, i); + // expected-error@-1 
{{initializing 'struct Foo' with an expression of incompatible type 'int'}} + + i = __builtin_elementwise_pext(i); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} + + i = __builtin_elementwise_pext(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} + + i = __builtin_elementwise_pext(i, i, i); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} + + i = __builtin_elementwise_pext(v, v); + // expected-error@-1 {{1st argument must be a scalar or vector of integer types (was 'float4' (vector of 4 'float' values))}} + + i = __builtin_elementwise_pext(i, s); + // expected-error@-1 {{arguments are of different types ('int' vs 'short')}} + + i = __builtin_elementwise_pext(uv, iv); + // expected-error@-1 {{arguments are of different types ('unsigned3' (vector of 3 'unsigned int' values) vs 'int3' (vector of 3 'int' values))}} + + unsigned _BitInt(31) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}} + ext = __builtin_elementwise_pext(ext, ext); + + u = __builtin_elementwise_pext(u, u); + vu = __builtin_elementwise_pext(vu, vu); +} + +void test_builtin_elementwise_pdep(int i, short s, double d, float4 v, + int3 iv, unsigned3 uv, unsigned u, + unsigned4 vu, int *p) { + i = __builtin_elementwise_pdep(p, d); + // expected-error@-1 {{1st argument must be a scalar or vector of integer types (was 'int *')}} + + struct Foo foo = __builtin_elementwise_pdep(i, i); + // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}} + + i = __builtin_elementwise_pdep(i); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} + + i = __builtin_elementwise_pdep(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} + + i = __builtin_elementwise_pdep(i, i, i); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} + + i = 
__builtin_elementwise_pdep(v, v); + // expected-error@-1 {{1st argument must be a scalar or vector of integer types (was 'float4' (vector of 4 'float' values))}} + + i = __builtin_elementwise_pdep(i, s); + // expected-error@-1 {{arguments are of different types ('int' vs 'short')}} + + i = __builtin_elementwise_pdep(uv, iv); + // expected-error@-1 {{arguments are of different types ('unsigned3' (vector of 3 'unsigned int' values) vs 'int3' (vector of 3 'int' values))}} + + unsigned _BitInt(31) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}} + ext = __builtin_elementwise_pdep(ext, ext); + + u = __builtin_elementwise_pdep(u, u); + vu = __builtin_elementwise_pdep(vu, vu); +} + void test_builtin_elementwise_max(int i, short s, double d, float4 v, int3 iv, unsigned3 uv, int *p) { i = __builtin_elementwise_max(p, d); // expected-error@-1 {{1st argument must be a vector, integer or floating-point type (was 'int *')}} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
