llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Thibault Monnier (Thibault-Monnier) <details> <summary>Changes</summary> This PR is part of #<!-- -->167752. It upstreams the codegen and tests for the shuffle builtins implemented in the incubator, including: - `vinsert` + `insert` - `pblend` + `blend` - `vpermilp` - `pshuf` + `shufp` - `palignr` It does NOT upstream the `perm`, `vperm2`, `vpshuf`, `shuf_i` / `shuf_f` and `align` builtins, which are not yet implemented in the incubator. This _is_ a large commit, but most of it is tests. The `pshufd` / `vpermilp` builtins seem to have no test coverage in the incubator — what should I do? --- Patch is 72.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169178.diff 15 Files Affected: - (modified) clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp (+173-10) - (modified) clang/lib/CIR/CodeGen/CIRGenFunction.h (+23-1) - (modified) clang/test/CIR/CodeGen/X86/avx-builtins.c (+81) - (added) clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c (+90) - (added) clang/test/CIR/CodeGen/X86/avx2-builtins.c (+145) - (added) clang/test/CIR/CodeGen/X86/avx512bw-builtins.c (+53) - (added) clang/test/CIR/CodeGen/X86/avx512dq-builtins.c (+55) - (modified) clang/test/CIR/CodeGen/X86/avx512f-builtins.c (+69) - (added) clang/test/CIR/CodeGen/X86/avx512vl-builtins.c (+24) - (added) clang/test/CIR/CodeGen/X86/avx512vldq-builtins.c (+31) - (added) clang/test/CIR/CodeGen/X86/palignr.c (+32) - (modified) clang/test/CIR/CodeGen/X86/sse-builtins.c (+12) - (modified) clang/test/CIR/CodeGen/X86/sse2-builtins.c (+41-2) - (added) clang/test/CIR/CodeGen/X86/sse3-builtins.c (+21) - (added) clang/test/CIR/CodeGen/X86/sse41-builtins.c (+55) ``````````diff diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 978fee7dbec9d..1cf38778b629f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -68,6 +68,35 
@@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder, return bitCast; } +static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf, + CIRGenBuilderTy &builder, + llvm::SmallVector<mlir::Value> &ops, + const CallExpr *expr, const bool isLow) { + uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]); + + auto vecTy = cast<cir::VectorType>(ops[0].getType()); + unsigned numElts = vecTy.getSize(); + + unsigned firstHalfStart = isLow ? 0 : 4; + unsigned secondHalfStart = 4 - firstHalfStart; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + imm = (imm & 0xff) * 0x01010101; + + int64_t indices[32]; + for (unsigned l = 0; l != numElts; l += 8) { + for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) { + indices[l + i] = l + (imm & 3) + firstHalfStart; + imm /= 4; + } + for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i) + indices[l + i] = l + i; + } + + return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0], + ArrayRef(indices, numElts)); +} + mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) { if (builtinID == Builtin::BI__builtin_cpu_is) { @@ -163,9 +192,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vec_ext_v4di: { unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize(); - uint64_t index = - ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue(); - + uint64_t index = getZExtIntValueFromConstOp(ops[1]); index &= numElts - 1; cir::ConstantOp indexVal = @@ -497,6 +524,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_extracti64x2_256_mask: case X86::BI__builtin_ia32_extractf64x2_512_mask: case X86::BI__builtin_ia32_extracti64x2_512_mask: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_vinsertf128_pd256: 
case X86::BI__builtin_ia32_vinsertf128_ps256: case X86::BI__builtin_ia32_vinsertf128_si256: @@ -512,9 +543,39 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_insertf64x2_256: case X86::BI__builtin_ia32_inserti64x2_256: case X86::BI__builtin_ia32_insertf64x2_512: - case X86::BI__builtin_ia32_inserti64x2_512: + case X86::BI__builtin_ia32_inserti64x2_512: { + unsigned dstNumElts = cast<cir::VectorType>(ops[0].getType()).getSize(); + unsigned srcNumElts = cast<cir::VectorType>(ops[1].getType()).getSize(); + unsigned subVectors = dstNumElts / srcNumElts; + assert(llvm::isPowerOf2_32(subVectors) && "Expected power of 2 subvectors"); + + uint64_t index = getZExtIntValueFromConstOp(ops[2]); + index &= subVectors - 1; // Remove any extra bits. + index *= srcNumElts; + + int64_t indices[16]; + for (unsigned i = 0; i != dstNumElts; ++i) + indices[i] = (i >= srcNumElts) ? srcNumElts + (i % srcNumElts) : i; + + mlir::Value op1 = builder.createVecShuffle( + getLoc(expr->getExprLoc()), ops[1], ArrayRef(indices, dstNumElts)); + + for (unsigned i = 0; i != dstNumElts; ++i) { + if (i >= index && i < (index + srcNumElts)) + indices[i] = (i - index) + dstNumElts; + else + indices[i] = i; + } + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], op1, + ArrayRef(indices, dstNumElts)); + } case X86::BI__builtin_ia32_pmovqd512_mask: case X86::BI__builtin_ia32_pmovwb512_mask: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_pblendw128: case X86::BI__builtin_ia32_blendpd: case X86::BI__builtin_ia32_blendps: @@ -522,13 +583,29 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_blendps256: case X86::BI__builtin_ia32_pblendw256: case X86::BI__builtin_ia32_pblendd128: - case X86::BI__builtin_ia32_pblendd256: + case X86::BI__builtin_ia32_pblendd256: { + 
uint32_t imm = getZExtIntValueFromConstOp(ops[2]); + unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize(); + + int64_t indices[16]; + // If there are more than 8 elements, the immediate is used twice so make + // sure we handle that. + for (unsigned i = 0; i != numElts; ++i) + indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i; + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1], + ArrayRef(indices, numElts)); + } case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: - case X86::BI__builtin_ia32_pshuflw512: + case X86::BI__builtin_ia32_pshuflw512: { + return emitPshufW(*this, builder, ops, expr, true); + } case X86::BI__builtin_ia32_pshufhw: case X86::BI__builtin_ia32_pshufhw256: - case X86::BI__builtin_ia32_pshufhw512: + case X86::BI__builtin_ia32_pshufhw512: { + return emitPshufW(*this, builder, ops, expr, false); + } case X86::BI__builtin_ia32_pshufd: case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: @@ -537,20 +614,106 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_vpermilpd256: case X86::BI__builtin_ia32_vpermilps256: case X86::BI__builtin_ia32_vpermilpd512: - case X86::BI__builtin_ia32_vpermilps512: + case X86::BI__builtin_ia32_vpermilps512: { + // TODO: Add tests for this branch. + uint32_t imm = getSExtIntValueFromConstOp(ops[1]); + + auto vecTy = cast<cir::VectorType>(ops[0].getType()); + unsigned numElts = vecTy.getSize(); + auto eltTy = vecTy.getElementType(); + + unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue(); + unsigned numLaneElts = 128 / eltBitWidth; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. 
+ imm = (imm & 0xff) * 0x01010101; + + llvm::SmallVector<int64_t, 16> indices; + for (unsigned l = 0; l != numElts; l += numLaneElts) { + for (unsigned i = 0; i != numLaneElts; ++i) { + indices.push_back((imm % numLaneElts) + l); + imm /= numLaneElts; + } + } + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], + indices); + } case X86::BI__builtin_ia32_shufpd: case X86::BI__builtin_ia32_shufpd256: case X86::BI__builtin_ia32_shufpd512: case X86::BI__builtin_ia32_shufps: case X86::BI__builtin_ia32_shufps256: - case X86::BI__builtin_ia32_shufps512: + case X86::BI__builtin_ia32_shufps512: { + uint32_t imm = getZExtIntValueFromConstOp(ops[2]); + + auto vecTy = cast<cir::VectorType>(ops[0].getType()); + unsigned numElts = vecTy.getSize(); + unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128; + unsigned numLaneElts = numElts / numLanes; + + // Splat the 8-bits of immediate 4 times to help the loop wrap around. + imm = (imm & 0xff) * 0x01010101; + + int64_t indices[16]; + for (unsigned l = 0; l != numElts; l += numLaneElts) { + for (unsigned i = 0; i != numLaneElts; ++i) { + uint32_t idx = imm % numLaneElts; + imm /= numLaneElts; + if (i >= (numLaneElts / 2)) + idx += numElts; + indices[l + i] = l + idx; + } + } + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1], + ArrayRef(indices, numElts)); + } case X86::BI__builtin_ia32_permdi256: case X86::BI__builtin_ia32_permdf256: case X86::BI__builtin_ia32_permdi512: case X86::BI__builtin_ia32_permdf512: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_palignr128: case X86::BI__builtin_ia32_palignr256: - case X86::BI__builtin_ia32_palignr512: + case X86::BI__builtin_ia32_palignr512: { + uint32_t shiftVal = getZExtIntValueFromConstOp(ops[2]) & 0xff; + + unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize(); + 
assert(numElts % 16 == 0); + + // If palignr is shifting the pair of vectors more than the size of two + // lanes, emit zero. + if (shiftVal >= 32) + return builder.getNullValue(convertType(expr->getType()), + getLoc(expr->getExprLoc())); + + // If palignr is shifting the pair of input vectors more than one lane, + // but less than two lanes, convert to shifting in zeroes. + if (shiftVal > 16) { + shiftVal -= 16; + ops[1] = ops[0]; + ops[0] = + builder.getNullValue(ops[0].getType(), getLoc(expr->getExprLoc())); + } + + int64_t indices[64]; + // 256-bit palignr operates on 128-bit lanes so we need to handle that + for (unsigned l = 0; l != numElts; l += 16) { + for (unsigned i = 0; i != 16; ++i) { + uint32_t idx = shiftVal + i; + if (idx >= 16) + idx += numElts - 16; // End of lane, switch operand. + indices[l + i] = l + idx; + } + } + + return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[1], ops[0], + ArrayRef(indices, numElts)); + } case X86::BI__builtin_ia32_alignd128: case X86::BI__builtin_ia32_alignd256: case X86::BI__builtin_ia32_alignd512: diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index b426f3389ff1b..53920fbce7bde 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1349,6 +1349,28 @@ class CIRGenFunction : public CIRGenTypeCache { cir::IntType resType, mlir::Value emittedE, bool isDynamic); + /// Get integer from a mlir::Value that is an int constant or a constant op. + static int64_t getSExtIntValueFromConstOp(mlir::Value val) { + auto constOp = val.getDefiningOp<cir::ConstantOp>(); + assert(constOp && "getIntValueFromConstOp call with non ConstantOp"); + return constOp.getIntValue().getSExtValue(); + } + + /// Get zero-extended integer from a mlir::Value that is an int constant or a + /// constant op. 
+ static int64_t getZExtIntValueFromConstOp(mlir::Value val) { + auto constOp = val.getDefiningOp<cir::ConstantOp>(); + assert(constOp && + "getZeroExtendedIntValueFromConstOp call with non ConstantOp"); + return constOp.getIntValue().getZExtValue(); + } + + /// Get size of type in bits using SizedTypeInterface + llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const { + assert(cir::isSized(Ty) && "Type must implement SizedTypeInterface"); + return cgm.getDataLayout().getTypeSizeInBits(ty); + } + mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e, unsigned type, cir::IntType resType, @@ -1804,7 +1826,7 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s); - mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e); + mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr); /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is /// nonnull, if 1\p LHS is marked _Nonnull. 
diff --git a/clang/test/CIR/CodeGen/X86/avx-builtins.c b/clang/test/CIR/CodeGen/X86/avx-builtins.c index 82fa4358dc400..66c4e166971d2 100644 --- a/clang/test/CIR/CodeGen/X86/avx-builtins.c +++ b/clang/test/CIR/CodeGen/X86/avx-builtins.c @@ -73,4 +73,85 @@ __m256i test_mm256_undefined_si256(void) { // OGCG-LABEL: test_mm256_undefined_si256 // OGCG: ret <4 x i64> zeroinitializer return _mm256_undefined_si256(); +} + +__m256d test_mm256_blend_pd(__m256d A, __m256d B) { + // CIR-LABEL: test_mm256_blend_pd + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> + + // LLVM-LABEL: test_mm256_blend_pd + // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + + // OGCG-LABEL: test_mm256_blend_pd + // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + return _mm256_blend_pd(A, B, 0x05); +} + +__m256 test_mm256_blend_ps(__m256 A, __m256 B) { + // CIR-LABEL: test_mm256_blend_ps + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i, #cir.int<3> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float> + + // LLVM-LABEL: test_mm256_blend_ps + // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7> + + // OGCG-LABEL: test_mm256_blend_ps + // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7> + return _mm256_blend_ps(A, B, 0x35); +} + +__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) { + // CIR-LABEL: test_mm256_insertf128_pd + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> + + // LLVM-LABEL: test_mm256_insertf128_pd + // LLVM: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + return _mm256_insertf128_pd(A, B, 0); +} + +__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) { + // CIR-LABEL: test_mm256_insertf128_ps + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !cir.float> + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i] : !cir.vector<8 x !cir.float> + + // LLVM-LABEL: test_mm256_insertf128_ps + // LLVM: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + return _mm256_insertf128_ps(A, B, 1); +} + +__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) { + // CIR-LABEL: test_mm256_insertf128_si256 + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<8 x !s32i> + // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s32i>) [#cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, 
#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i] + + // LLVM-LABEL: test_mm256_insertf128_si256 + // LLVM: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // LLVM: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7> + return _mm256_insertf128_si256(A, B, 0); +} + +__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) { + // CIR-LABEL: test_mm256_shuffle_pd + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, #cir.int<6> : !s32i] : !cir.vector<4 x !cir.double> + + // CHECK-LABEL: test_mm256_shuffle_pd + // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + + // OGCG-LABEL: test_mm256_shuffle_pd + // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + return _mm256_shuffle_pd(A, B, 0); +} + +__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) { + // CIR-LABEL: test_mm256_shuffle_ps + // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<8> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<12> : !s32i] : !cir.vector<8 x !cir.float> + + // CHECK-LABEL: test_mm256_shuffle_ps + // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> + + // OGCG-LABEL: test_mm256_shuffle_ps + // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> + return _mm256_shuffle_ps(A, B, 0); } \ No newline at end of file diff --git a/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c b/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c new file 
mode 100644 index 0000000000000..6384dcd0973fa --- /dev/null +++ b/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: x86-registered-target +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx -disable-O0-optnone -fclangir -emit-cir -o %t.cir | opt -S -passes=mem2reg +// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s + +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx -disable-O0-optnone -fclangir -emit-llvm -o %t.ll | opt -S -passes=mem2reg +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +#include <immintrin.h> + +// CIR-LABEL: @test_mm256_insertf128_pd_0( +// CIR: [[A:%.*]] = cir.load align(32) %0 : !cir.ptr<!cir.vector<4 x !cir.double>>, !cir.vector<4 x !cir.double> +// CIR: [[B:%.*]] = cir.load align(16) %1 : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double> +// CIR: %{{.*}} = cir.vec.shuffle([[B]], %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> +// CIR-NEXT: %{{.*}} = cir.vec.shuffle([[A]], %{{.*}} : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double> +// CIR: cir.return %{{.*}} : !cir.vector<4 x !cir.double> + +// LLVM-LABEL: @test_mm256_insertf128_pd_0 +// LLVM: [[A:%.*]] = load <4 x double>, ptr %{{.*}}, align 32 +// LLVM: [[B:%.*]] = load <2 x double>, ptr %{{.*}}, align 16 +// LLVM-NEXT: [[WIDEN:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// LLVM-NEXT: [[INSERT:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[WIDEN]], <4 x i32> <i32 4, i32 5, i32 2, i32 3> +// LLVM: ret <4 x double> +__m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) { + return _mm256_insertf128_pd(a, b, 0); +} + 
+// CIR-LABEL: @test_mm256_insertf128_ps_0( +// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/169178 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
