[clang] [CIR] Upstream CIR codegen for blend x86 builtins (PR #174236)
https://github.com/Thibault-Monnier closed https://github.com/llvm/llvm-project/pull/174236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CIR] Upstream CIR codegen for blend x86 builtins (PR #174236)
https://github.com/Thibault-Monnier updated
https://github.com/llvm/llvm-project/pull/174236
>From 855251d9f52f293aa18564f1b8f513728bf6a409 Mon Sep 17 00:00:00 2001
From: Thibault-Monnier
Date: Fri, 2 Jan 2026 20:25:17 +0100
Subject: [PATCH 1/2] Upstream CIR codegen for x86 blend builtins
---
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp| 22 +--
.../CIR/CodeGenBuiltins/X86/avx-builtins.c| 24
.../CIR/CodeGenBuiltins/X86/avx2-builtins.c | 42 ++
.../CIR/CodeGenBuiltins/X86/sse41-builtins.c | 57 +--
4 files changed, 135 insertions(+), 10 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 1c87e945de846..46de5bb5b9036 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1275,6 +1275,10 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
}
case X86::BI__builtin_ia32_pmovqd512_mask:
case X86::BI__builtin_ia32_pmovwb512_mask:
+cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+return mlir::Value{};
case X86::BI__builtin_ia32_pblendw128:
case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_blendps:
@@ -1282,11 +1286,19 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
case X86::BI__builtin_ia32_blendps256:
case X86::BI__builtin_ia32_pblendw256:
case X86::BI__builtin_ia32_pblendd128:
- case X86::BI__builtin_ia32_pblendd256:
-cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
-return mlir::Value{};
+ case X86::BI__builtin_ia32_pblendd256: {
+uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+unsigned numElts = cast(ops[0].getType()).getSize();
+
+int64_t indices[16];
+// If there are more than 8 elements, the immediate is used twice so make
+// sure we handle that.
+for (unsigned i = 0; i != numElts; ++i)
+ indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i;
+
+return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+ArrayRef(indices, numElts));
+ }
case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
index 0de782701ddc4..01ca55994ce50 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
@@ -75,6 +75,30 @@ __m256i test_mm256_undefined_si256(void) {
return _mm256_undefined_si256();
}
+__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
+ // CIR-LABEL: test_mm256_blend_pd
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x
!cir.double>) [#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i,
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+
+ // LLVM-LABEL: test_mm256_blend_pd
+ // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+
+ // OGCG-LABEL: test_mm256_blend_pd
+ // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+ return _mm256_blend_pd(A, B, 0x05);
+}
+
+__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
+ // CIR-LABEL: test_mm256_blend_ps
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x
!cir.float>) [#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i,
#cir.int<3> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> :
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+
+ // LLVM-LABEL: test_mm256_blend_ps
+ // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
+
+ // OGCG-LABEL: test_mm256_blend_ps
+ // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
+ return _mm256_blend_ps(A, B, 0x35);
+}
+
__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
// CIR-LABEL: test_mm256_insertf128_pd
// %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i,
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
index d4ed43ca1d26b..d3e54920ef186 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
@@ -58,6 +58,48 @@ __m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
return _mm256_inserti128_si256(a, b, 1);
}
+__m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
+ // CIR-LABEL: _mm256_blend_epi16
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}
[clang] [CIR] Upstream CIR codegen for blend x86 builtins (PR #174236)
@@ -1275,18 +1275,30 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
}
case X86::BI__builtin_ia32_pmovqd512_mask:
case X86::BI__builtin_ia32_pmovwb512_mask:
+cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+return mlir::Value{};
case X86::BI__builtin_ia32_pblendw128:
case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_blendps:
case X86::BI__builtin_ia32_blendpd256:
case X86::BI__builtin_ia32_blendps256:
case X86::BI__builtin_ia32_pblendw256:
case X86::BI__builtin_ia32_pblendd128:
- case X86::BI__builtin_ia32_pblendd256:
-cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
-return mlir::Value{};
+ case X86::BI__builtin_ia32_pblendd256: {
+uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+unsigned numElts = cast(ops[0].getType()).getSize();
+
+int64_t indices[16];
Thibault-Monnier wrote:
Done, thanks.
https://github.com/llvm/llvm-project/pull/174236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CIR] Upstream CIR codegen for blend x86 builtins (PR #174236)
https://github.com/Thibault-Monnier updated
https://github.com/llvm/llvm-project/pull/174236
>From 855251d9f52f293aa18564f1b8f513728bf6a409 Mon Sep 17 00:00:00 2001
From: Thibault-Monnier
Date: Fri, 2 Jan 2026 20:25:17 +0100
Subject: [PATCH 1/2] Upstream CIR codegen for x86 blend builtins
---
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp| 22 +--
.../CIR/CodeGenBuiltins/X86/avx-builtins.c| 24
.../CIR/CodeGenBuiltins/X86/avx2-builtins.c | 42 ++
.../CIR/CodeGenBuiltins/X86/sse41-builtins.c | 57 +--
4 files changed, 135 insertions(+), 10 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 1c87e945de846..46de5bb5b9036 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1275,6 +1275,10 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
}
case X86::BI__builtin_ia32_pmovqd512_mask:
case X86::BI__builtin_ia32_pmovwb512_mask:
+cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+return mlir::Value{};
case X86::BI__builtin_ia32_pblendw128:
case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_blendps:
@@ -1282,11 +1286,19 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
case X86::BI__builtin_ia32_blendps256:
case X86::BI__builtin_ia32_pblendw256:
case X86::BI__builtin_ia32_pblendd128:
- case X86::BI__builtin_ia32_pblendd256:
-cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
-return mlir::Value{};
+ case X86::BI__builtin_ia32_pblendd256: {
+uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+unsigned numElts = cast(ops[0].getType()).getSize();
+
+int64_t indices[16];
+// If there are more than 8 elements, the immediate is used twice so make
+// sure we handle that.
+for (unsigned i = 0; i != numElts; ++i)
+ indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i;
+
+return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+ArrayRef(indices, numElts));
+ }
case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
index 0de782701ddc4..01ca55994ce50 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
@@ -75,6 +75,30 @@ __m256i test_mm256_undefined_si256(void) {
return _mm256_undefined_si256();
}
+__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
+ // CIR-LABEL: test_mm256_blend_pd
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x
!cir.double>) [#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i,
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+
+ // LLVM-LABEL: test_mm256_blend_pd
+ // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+
+ // OGCG-LABEL: test_mm256_blend_pd
+ // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+ return _mm256_blend_pd(A, B, 0x05);
+}
+
+__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
+ // CIR-LABEL: test_mm256_blend_ps
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x
!cir.float>) [#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i,
#cir.int<3> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> :
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+
+ // LLVM-LABEL: test_mm256_blend_ps
+ // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
+
+ // OGCG-LABEL: test_mm256_blend_ps
+ // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
+ return _mm256_blend_ps(A, B, 0x35);
+}
+
__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
// CIR-LABEL: test_mm256_insertf128_pd
// %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i,
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
index d4ed43ca1d26b..d3e54920ef186 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
@@ -58,6 +58,48 @@ __m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
return _mm256_inserti128_si256(a, b, 1);
}
+__m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
+ // CIR-LABEL: _mm256_blend_epi16
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}
[clang] [CIR] Upstream CIR codegen for blend x86 builtins (PR #174236)
@@ -1275,18 +1275,30 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
}
case X86::BI__builtin_ia32_pmovqd512_mask:
case X86::BI__builtin_ia32_pmovwb512_mask:
+cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+return mlir::Value{};
case X86::BI__builtin_ia32_pblendw128:
case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_blendps:
case X86::BI__builtin_ia32_blendpd256:
case X86::BI__builtin_ia32_blendps256:
case X86::BI__builtin_ia32_pblendw256:
case X86::BI__builtin_ia32_pblendd128:
- case X86::BI__builtin_ia32_pblendd256:
-cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
-return mlir::Value{};
+ case X86::BI__builtin_ia32_pblendd256: {
+uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+unsigned numElts = cast(ops[0].getType()).getSize();
+
+int64_t indices[16];
andykaylor wrote:
Rather than populating an array of integers here, can you create a
`SmallVector` of `mlir::Attribute` and populate it with `cir::IntAttr` created
based on the value at line 1297? That avoids any concerns about overflowing the
number of elements, and it saves `createVecShuffle` from having to create such
a vector of attributes.
https://github.com/llvm/llvm-project/pull/174236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CIR] Upstream CIR codegen for blend x86 builtins (PR #174236)
llvmbot wrote:
@llvm/pr-subscribers-clang
Author: Thibault Monnier (Thibault-Monnier)
Changes
Part of https://github.com/llvm/llvm-project/issues/167752.
---
Full diff: https://github.com/llvm/llvm-project/pull/174236.diff
4 Files Affected:
- (modified) clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp (+17-5)
- (modified) clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c (+24)
- (modified) clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c (+42)
- (modified) clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c (+52-5)
```diff
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 1c87e945de846..46de5bb5b9036 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1275,6 +1275,10 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
}
case X86::BI__builtin_ia32_pmovqd512_mask:
case X86::BI__builtin_ia32_pmovwb512_mask:
+cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+return mlir::Value{};
case X86::BI__builtin_ia32_pblendw128:
case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_blendps:
@@ -1282,11 +1286,19 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
case X86::BI__builtin_ia32_blendps256:
case X86::BI__builtin_ia32_pblendw256:
case X86::BI__builtin_ia32_pblendd128:
- case X86::BI__builtin_ia32_pblendd256:
-cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
-return mlir::Value{};
+ case X86::BI__builtin_ia32_pblendd256: {
+uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+unsigned numElts = cast(ops[0].getType()).getSize();
+
+int64_t indices[16];
+// If there are more than 8 elements, the immediate is used twice so make
+// sure we handle that.
+for (unsigned i = 0; i != numElts; ++i)
+ indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i;
+
+return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+ArrayRef(indices, numElts));
+ }
case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
index 0de782701ddc4..01ca55994ce50 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
@@ -75,6 +75,30 @@ __m256i test_mm256_undefined_si256(void) {
return _mm256_undefined_si256();
}
+__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
+ // CIR-LABEL: test_mm256_blend_pd
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x
!cir.double>) [#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i,
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+
+ // LLVM-LABEL: test_mm256_blend_pd
+ // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+
+ // OGCG-LABEL: test_mm256_blend_pd
+ // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+ return _mm256_blend_pd(A, B, 0x05);
+}
+
+__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
+ // CIR-LABEL: test_mm256_blend_ps
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x
!cir.float>) [#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i,
#cir.int<3> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> :
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+
+ // LLVM-LABEL: test_mm256_blend_ps
+ // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
+
+ // OGCG-LABEL: test_mm256_blend_ps
+ // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
+ return _mm256_blend_ps(A, B, 0x35);
+}
+
__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
// CIR-LABEL: test_mm256_insertf128_pd
// %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i,
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
index d4ed43ca1d26b..d3e54920ef186 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
@@ -58,6 +58,48 @@ __m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
return _mm256_inserti128_si256(a, b, 1);
}
+__m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
+ // CIR-LABEL: _mm256_blend_epi16
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x
!s16i>) [#cir.int<0> : !s32i, #cir.int<1
[clang] [CIR] Upstream CIR codegen for blend x86 builtins (PR #174236)
https://github.com/Thibault-Monnier created
https://github.com/llvm/llvm-project/pull/174236
Part of https://github.com/llvm/llvm-project/issues/167752.
>From 855251d9f52f293aa18564f1b8f513728bf6a409 Mon Sep 17 00:00:00 2001
From: Thibault-Monnier
Date: Fri, 2 Jan 2026 20:25:17 +0100
Subject: [PATCH] Upstream CIR codegen for x86 blend builtins
---
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp| 22 +--
.../CIR/CodeGenBuiltins/X86/avx-builtins.c| 24
.../CIR/CodeGenBuiltins/X86/avx2-builtins.c | 42 ++
.../CIR/CodeGenBuiltins/X86/sse41-builtins.c | 57 +--
4 files changed, 135 insertions(+), 10 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 1c87e945de846..46de5bb5b9036 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1275,6 +1275,10 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
}
case X86::BI__builtin_ia32_pmovqd512_mask:
case X86::BI__builtin_ia32_pmovwb512_mask:
+cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+return mlir::Value{};
case X86::BI__builtin_ia32_pblendw128:
case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_blendps:
@@ -1282,11 +1286,19 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
const CallExpr *expr) {
case X86::BI__builtin_ia32_blendps256:
case X86::BI__builtin_ia32_pblendw256:
case X86::BI__builtin_ia32_pblendd128:
- case X86::BI__builtin_ia32_pblendd256:
-cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
-return mlir::Value{};
+ case X86::BI__builtin_ia32_pblendd256: {
+uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+unsigned numElts = cast(ops[0].getType()).getSize();
+
+int64_t indices[16];
+// If there are more than 8 elements, the immediate is used twice so make
+// sure we handle that.
+for (unsigned i = 0; i != numElts; ++i)
+ indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i;
+
+return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+ArrayRef(indices, numElts));
+ }
case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
index 0de782701ddc4..01ca55994ce50 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx-builtins.c
@@ -75,6 +75,30 @@ __m256i test_mm256_undefined_si256(void) {
return _mm256_undefined_si256();
}
+__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
+ // CIR-LABEL: test_mm256_blend_pd
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x
!cir.double>) [#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i,
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+
+ // LLVM-LABEL: test_mm256_blend_pd
+ // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+
+ // OGCG-LABEL: test_mm256_blend_pd
+ // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+ return _mm256_blend_pd(A, B, 0x05);
+}
+
+__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
+ // CIR-LABEL: test_mm256_blend_ps
+ // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x
!cir.float>) [#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i,
#cir.int<3> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> :
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+
+ // LLVM-LABEL: test_mm256_blend_ps
+ // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
+
+ // OGCG-LABEL: test_mm256_blend_ps
+ // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
+ return _mm256_blend_ps(A, B, 0x35);
+}
+
__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
// CIR-LABEL: test_mm256_insertf128_pd
// %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i,
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
index d4ed43ca1d26b..d3e54920ef186 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
@@ -58,6 +58,48 @@ __m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
return _mm256_inserti128_si256(a, b, 1);
}
+__m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
+ // CIR-LABEL: _mm256_blend_e
