[clang] [CIR] Upstream vec shuffle builtins in CIR codegen (PR #169178)

Thibault Monnier via cfe-commits Sat, 22 Nov 2025 09:44:01 -0800

https://github.com/Thibault-Monnier created 
https://github.com/llvm/llvm-project/pull/169178


This PR is part of #167752. It upstreams the codegen and tests for the shuffle 
builtins implemented in the incubator, including:
- `vinsert` + `insert`
- `pblend` + `blend`
- `vpermilp`
- `pshuf` + `shufp`
- `palignr`

It does NOT upstream the `perm`, `vperm2`, `vpshuf`, `shuf_i` / `shuf_f` and 
`align` builtins, which are not yet implement in the incubator.

This _is_ a large commit, but most of it is tests.

The `pshufd` / `vpermilp` builtins seem to have no test coverage in the 
incubator, what should I do?

>From c76fc2db63d8707217cb9c430e58d01153dd8cba Mon Sep 17 00:00:00 2001
From: Thibault-Monnier <[email protected]>
Date: Sat, 22 Nov 2025 18:15:23 +0100
Subject: [PATCH] Upstream vec shuffle builtins in CIR codegen

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 183 +++++++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  24 ++-
 clang/test/CIR/CodeGen/X86/avx-builtins.c     |  81 ++++++++
 .../CIR/CodeGen/X86/avx-shuffle-builtins.c    |  90 +++++++++
 clang/test/CIR/CodeGen/X86/avx2-builtins.c    | 145 ++++++++++++++
 .../test/CIR/CodeGen/X86/avx512bw-builtins.c  |  53 +++++
 .../test/CIR/CodeGen/X86/avx512dq-builtins.c  |  55 ++++++
 clang/test/CIR/CodeGen/X86/avx512f-builtins.c |  69 +++++++
 .../test/CIR/CodeGen/X86/avx512vl-builtins.c  |  24 +++
 .../CIR/CodeGen/X86/avx512vldq-builtins.c     |  31 +++
 clang/test/CIR/CodeGen/X86/palignr.c          |  32 +++
 clang/test/CIR/CodeGen/X86/sse-builtins.c     |  12 ++
 clang/test/CIR/CodeGen/X86/sse2-builtins.c    |  43 +++-
 clang/test/CIR/CodeGen/X86/sse3-builtins.c    |  21 ++
 clang/test/CIR/CodeGen/X86/sse41-builtins.c   |  55 ++++++
 15 files changed, 905 insertions(+), 13 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/avx2-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/avx512dq-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/avx512vl-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/avx512vldq-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/palignr.c
 create mode 100644 clang/test/CIR/CodeGen/X86/sse3-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/sse41-builtins.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 978fee7dbec9d..1cf38778b629f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -68,6 +68,35 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
+static cir::VecShuffleOp emitPshufW(CIRGenFunction &cgf,
+                                    CIRGenBuilderTy &builder,
+                                    llvm::SmallVector<mlir::Value> &ops,
+                                    const CallExpr *expr, const bool isLow) {
+  uint32_t imm = cgf.getZExtIntValueFromConstOp(ops[1]);
+
+  auto vecTy = cast<cir::VectorType>(ops[0].getType());
+  unsigned numElts = vecTy.getSize();
+
+  unsigned firstHalfStart = isLow ? 0 : 4;
+  unsigned secondHalfStart = 4 - firstHalfStart;
+
+  // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+  imm = (imm & 0xff) * 0x01010101;
+
+  int64_t indices[32];
+  for (unsigned l = 0; l != numElts; l += 8) {
+    for (unsigned i = firstHalfStart; i != firstHalfStart + 4; ++i) {
+      indices[l + i] = l + (imm & 3) + firstHalfStart;
+      imm /= 4;
+    }
+    for (unsigned i = secondHalfStart; i != secondHalfStart + 4; ++i)
+      indices[l + i] = l + i;
+  }
+
+  return builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), ops[0],
+                                  ArrayRef(indices, numElts));
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -163,9 +192,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_vec_ext_v4di: {
     unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
 
-    uint64_t index =
-        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue();
-
+    uint64_t index = getZExtIntValueFromConstOp(ops[1]);
     index &= numElts - 1;
 
     cir::ConstantOp indexVal =
@@ -497,6 +524,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_extracti64x2_256_mask:
   case X86::BI__builtin_ia32_extractf64x2_512_mask:
   case X86::BI__builtin_ia32_extracti64x2_512_mask:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_vinsertf128_pd256:
   case X86::BI__builtin_ia32_vinsertf128_ps256:
   case X86::BI__builtin_ia32_vinsertf128_si256:
@@ -512,9 +543,39 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_insertf64x2_256:
   case X86::BI__builtin_ia32_inserti64x2_256:
   case X86::BI__builtin_ia32_insertf64x2_512:
-  case X86::BI__builtin_ia32_inserti64x2_512:
+  case X86::BI__builtin_ia32_inserti64x2_512: {
+    unsigned dstNumElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+    unsigned srcNumElts = cast<cir::VectorType>(ops[1].getType()).getSize();
+    unsigned subVectors = dstNumElts / srcNumElts;
+    assert(llvm::isPowerOf2_32(subVectors) && "Expected power of 2 
subvectors");
+
+    uint64_t index = getZExtIntValueFromConstOp(ops[2]);
+    index &= subVectors - 1; // Remove any extra bits.
+    index *= srcNumElts;
+
+    int64_t indices[16];
+    for (unsigned i = 0; i != dstNumElts; ++i)
+      indices[i] = (i >= srcNumElts) ? srcNumElts + (i % srcNumElts) : i;
+
+    mlir::Value op1 = builder.createVecShuffle(
+        getLoc(expr->getExprLoc()), ops[1], ArrayRef(indices, dstNumElts));
+
+    for (unsigned i = 0; i != dstNumElts; ++i) {
+      if (i >= index && i < (index + srcNumElts))
+        indices[i] = (i - index) + dstNumElts;
+      else
+        indices[i] = i;
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], op1,
+                                    ArrayRef(indices, dstNumElts));
+  }
   case X86::BI__builtin_ia32_pmovqd512_mask:
   case X86::BI__builtin_ia32_pmovwb512_mask:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_pblendw128:
   case X86::BI__builtin_ia32_blendpd:
   case X86::BI__builtin_ia32_blendps:
@@ -522,13 +583,29 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_blendps256:
   case X86::BI__builtin_ia32_pblendw256:
   case X86::BI__builtin_ia32_pblendd128:
-  case X86::BI__builtin_ia32_pblendd256:
+  case X86::BI__builtin_ia32_pblendd256: {
+    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+    unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+
+    int64_t indices[16];
+    // If there are more than 8 elements, the immediate is used twice so make
+    // sure we handle that.
+    for (unsigned i = 0; i != numElts; ++i)
+      indices[i] = ((imm >> (i % 8)) & 0x1) ? numElts + i : i;
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_pshuflw:
   case X86::BI__builtin_ia32_pshuflw256:
-  case X86::BI__builtin_ia32_pshuflw512:
+  case X86::BI__builtin_ia32_pshuflw512: {
+    return emitPshufW(*this, builder, ops, expr, true);
+  }
   case X86::BI__builtin_ia32_pshufhw:
   case X86::BI__builtin_ia32_pshufhw256:
-  case X86::BI__builtin_ia32_pshufhw512:
+  case X86::BI__builtin_ia32_pshufhw512: {
+    return emitPshufW(*this, builder, ops, expr, false);
+  }
   case X86::BI__builtin_ia32_pshufd:
   case X86::BI__builtin_ia32_pshufd256:
   case X86::BI__builtin_ia32_pshufd512:
@@ -537,20 +614,106 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_vpermilpd256:
   case X86::BI__builtin_ia32_vpermilps256:
   case X86::BI__builtin_ia32_vpermilpd512:
-  case X86::BI__builtin_ia32_vpermilps512:
+  case X86::BI__builtin_ia32_vpermilps512: {
+    // TODO: Add tests for this branch.
+    uint32_t imm = getSExtIntValueFromConstOp(ops[1]);
+
+    auto vecTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = vecTy.getSize();
+    auto eltTy = vecTy.getElementType();
+
+    unsigned eltBitWidth = getTypeSizeInBits(eltTy).getFixedValue();
+    unsigned numLaneElts = 128 / eltBitWidth;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    llvm::SmallVector<int64_t, 16> indices;
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        indices.push_back((imm % numLaneElts) + l);
+        imm /= numLaneElts;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0],
+                                    indices);
+  }
   case X86::BI__builtin_ia32_shufpd:
   case X86::BI__builtin_ia32_shufpd256:
   case X86::BI__builtin_ia32_shufpd512:
   case X86::BI__builtin_ia32_shufps:
   case X86::BI__builtin_ia32_shufps256:
-  case X86::BI__builtin_ia32_shufps512:
+  case X86::BI__builtin_ia32_shufps512: {
+    uint32_t imm = getZExtIntValueFromConstOp(ops[2]);
+
+    auto vecTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = vecTy.getSize();
+    unsigned numLanes = cgm.getDataLayout().getTypeSizeInBits(vecTy) / 128;
+    unsigned numLaneElts = numElts / numLanes;
+
+    // Splat the 8-bits of immediate 4 times to help the loop wrap around.
+    imm = (imm & 0xff) * 0x01010101;
+
+    int64_t indices[16];
+    for (unsigned l = 0; l != numElts; l += numLaneElts) {
+      for (unsigned i = 0; i != numLaneElts; ++i) {
+        uint32_t idx = imm % numLaneElts;
+        imm /= numLaneElts;
+        if (i >= (numLaneElts / 2))
+          idx += numElts;
+        indices[l + i] = l + idx;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[0], ops[1],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_permdi256:
   case X86::BI__builtin_ia32_permdf256:
   case X86::BI__builtin_ia32_permdi512:
   case X86::BI__builtin_ia32_permdf512:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_palignr128:
   case X86::BI__builtin_ia32_palignr256:
-  case X86::BI__builtin_ia32_palignr512:
+  case X86::BI__builtin_ia32_palignr512: {
+    uint32_t shiftVal = getZExtIntValueFromConstOp(ops[2]) & 0xff;
+
+    unsigned numElts = cast<cir::VectorType>(ops[0].getType()).getSize();
+    assert(numElts % 16 == 0);
+
+    // If palignr is shifting the pair of vectors more than the size of two
+    // lanes, emit zero.
+    if (shiftVal >= 32)
+      return builder.getNullValue(convertType(expr->getType()),
+                                  getLoc(expr->getExprLoc()));
+
+    // If palignr is shifting the pair of input vectors more than one lane,
+    // but less than two lanes, convert to shifting in zeroes.
+    if (shiftVal > 16) {
+      shiftVal -= 16;
+      ops[1] = ops[0];
+      ops[0] =
+          builder.getNullValue(ops[0].getType(), getLoc(expr->getExprLoc()));
+    }
+
+    int64_t indices[64];
+    // 256-bit palignr operates on 128-bit lanes so we need to handle that
+    for (unsigned l = 0; l != numElts; l += 16) {
+      for (unsigned i = 0; i != 16; ++i) {
+        uint32_t idx = shiftVal + i;
+        if (idx >= 16)
+          idx += numElts - 16; // End of lane, switch operand.
+        indices[l + i] = l + idx;
+      }
+    }
+
+    return builder.createVecShuffle(getLoc(expr->getExprLoc()), ops[1], ops[0],
+                                    ArrayRef(indices, numElts));
+  }
   case X86::BI__builtin_ia32_alignd128:
   case X86::BI__builtin_ia32_alignd256:
   case X86::BI__builtin_ia32_alignd512:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h 
b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index b426f3389ff1b..53920fbce7bde 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1349,6 +1349,28 @@ class CIRGenFunction : public CIRGenTypeCache {
                                     cir::IntType resType, mlir::Value emittedE,
                                     bool isDynamic);
 
+  /// Get integer from a mlir::Value that is an int constant or a constant op.
+  static int64_t getSExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp && "getIntValueFromConstOp call with non ConstantOp");
+    return constOp.getIntValue().getSExtValue();
+  }
+
+  /// Get zero-extended integer from a mlir::Value that is an int constant or a
+  /// constant op.
+  static int64_t getZExtIntValueFromConstOp(mlir::Value val) {
+    auto constOp = val.getDefiningOp<cir::ConstantOp>();
+    assert(constOp &&
+           "getZeroExtendedIntValueFromConstOp call with non ConstantOp");
+    return constOp.getIntValue().getZExtValue();
+  }
+
+  /// Get size of type in bits using SizedTypeInterface
+  llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const {
+    assert(cir::isSized(Ty) && "Type must implement SizedTypeInterface");
+    return cgm.getDataLayout().getTypeSizeInBits(ty);
+  }
+
   mlir::Value evaluateOrEmitBuiltinObjectSize(const clang::Expr *e,
                                               unsigned type,
                                               cir::IntType resType,
@@ -1804,7 +1826,7 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s);
 
-  mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e);
+  mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr);
 
   /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is
   /// nonnull, if 1\p LHS is marked _Nonnull.
diff --git a/clang/test/CIR/CodeGen/X86/avx-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx-builtins.c
index 82fa4358dc400..66c4e166971d2 100644
--- a/clang/test/CIR/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/avx-builtins.c
@@ -73,4 +73,85 @@ __m256i test_mm256_undefined_si256(void) {
   // OGCG-LABEL: test_mm256_undefined_si256
   // OGCG: ret <4 x i64> zeroinitializer
   return _mm256_undefined_si256();
+}
+
+__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
+  // CIR-LABEL: test_mm256_blend_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.double>) [#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+
+  // LLVM-LABEL: test_mm256_blend_pd
+  // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
<i32 4, i32 1, i32 6, i32 3>
+
+  // OGCG-LABEL: test_mm256_blend_pd
+  // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
<i32 4, i32 1, i32 6, i32 3>
+  return _mm256_blend_pd(A, B, 0x05);
+}
+
+__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
+  // CIR-LABEL: test_mm256_blend_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.float>) [#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i, 
#cir.int<3> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: test_mm256_blend_ps
+  // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+
+  // OGCG-LABEL: test_mm256_blend_ps
+  // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+  return _mm256_blend_ps(A, B, 0x35);
+}
+
+__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
+  // CIR-LABEL: test_mm256_insertf128_pd
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.double>) [#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+
+  // LLVM-LABEL: test_mm256_insertf128_pd
+  // LLVM: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <4 x i32> 
<i32 0, i32 1, i32 2, i32 3>
+  // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
<i32 4, i32 5, i32 2, i32 3>
+  return _mm256_insertf128_pd(A, B, 0);
+}
+
+__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
+  // CIR-LABEL: test_mm256_insertf128_ps
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i] : !cir.vector<8 x !cir.float>
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.float>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, 
#cir.int<11> : !s32i] : !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: test_mm256_insertf128_ps
+  // LLVM: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm256_insertf128_ps(A, B, 1);
+}
+
+__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) {
+  // CIR-LABEL: test_mm256_insertf128_si256
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i] : !cir.vector<8 x !s32i>
+  // %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s32i>) 
[#cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : 
!s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, 
#cir.int<7> : !s32i]
+
+  // LLVM-LABEL: test_mm256_insertf128_si256
+  // LLVM: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> <i32 
0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // LLVM: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 
8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_insertf128_si256(A, B, 0);
+}
+
+__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
+  // CIR-LABEL: test_mm256_shuffle_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<2> : !s32i, 
#cir.int<6> : !s32i] : !cir.vector<4 x !cir.double>
+
+  // CHECK-LABEL: test_mm256_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x 
i32> <i32 0, i32 4, i32 2, i32 6>
+
+  // OGCG-LABEL: test_mm256_shuffle_pd
+  // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
<i32 0, i32 4, i32 2, i32 6>
+  return _mm256_shuffle_pd(A, B, 0);
+}
+
+__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
+  // CIR-LABEL: test_mm256_shuffle_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<8> : !s32i, 
#cir.int<8> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : 
!s32i, #cir.int<12> : !s32i] : !cir.vector<8 x !cir.float>
+
+  // CHECK-LABEL: test_mm256_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+
+  // OGCG-LABEL: test_mm256_shuffle_ps
+  // OGCG: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+  return _mm256_shuffle_ps(A, B, 0);
 }
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c
new file mode 100644
index 0000000000000..6384dcd0973fa
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx-shuffle-builtins.c
@@ -0,0 +1,90 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-linux 
-target-feature +avx -disable-O0-optnone -fclangir -emit-cir -o %t.cir | opt -S 
-passes=mem2reg
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-linux 
-target-feature +avx -disable-O0-optnone -fclangir -emit-llvm -o %t.ll | opt -S 
-passes=mem2reg
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+#include <immintrin.h>
+
+// CIR-LABEL: @test_mm256_insertf128_pd_0(
+// CIR: [[A:%.*]] = cir.load align(32) %0 : !cir.ptr<!cir.vector<4 x 
!cir.double>>, !cir.vector<4 x !cir.double>
+// CIR: [[B:%.*]] = cir.load align(16) %1 : !cir.ptr<!cir.vector<2 x 
!cir.double>>, !cir.vector<2 x !cir.double>
+// CIR: %{{.*}} = cir.vec.shuffle([[B]], %{{.*}} : !cir.vector<2 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+// CIR-NEXT: %{{.*}} = cir.vec.shuffle([[A]], %{{.*}} : !s32i, #cir.int<5> : 
!s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.double>
+// CIR: cir.return %{{.*}} : !cir.vector<4 x !cir.double>
+
+// LLVM-LABEL: @test_mm256_insertf128_pd_0
+// LLVM:    [[A:%.*]] = load <4 x double>, ptr %{{.*}}, align 32
+// LLVM:    [[B:%.*]] = load <2 x double>, ptr %{{.*}}, align 16
+// LLVM-NEXT:    [[WIDEN:%.*]] = shufflevector <2 x double> [[B]], <2 x 
double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// LLVM-NEXT:    [[INSERT:%.*]] = shufflevector <4 x double> [[A]], <4 x 
double> [[WIDEN]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+// LLVM:    ret <4 x double>
+__m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) {
+  return _mm256_insertf128_pd(a, b, 0);
+}
+
+// CIR-LABEL: @test_mm256_insertf128_ps_0(
+// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+// CIR-NEXT: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.float>) [#cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, 
#cir.int<11> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+// CIR: cir.return %{{.*}} : !cir.vector<8 x !cir.float>
+
+// LLVM-LABEL: @test_mm256_insertf128_ps_0(
+// LLVM:    %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> poison, 
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// LLVM-NEXT:    %{{.*}} = shufflevector <8 x float> %{{.*}}, <8 x float> 
%{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+// LLVM:    ret <8 x float> %{{.*}}
+__m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
+  return _mm256_insertf128_ps(a, b, 0);
+}
+
+// CIR-LABEL: @test_mm256_insertf128_ps_1(
+// CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.float>
+// CIR-NEXT: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : 
!s32i, #cir.int<11> : !s32i] : !cir.vector<8 x !cir.float>
+// CIR: cir.return %{{.*}} : !cir.vector<8 x !cir.float>
+
+// LLVM-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_1(
+// LLVM:    %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> poison, 
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// LLVM-NEXT:    %{{.*}} = shufflevector <8 x float> %{{.*}}, <8 x float> 
%{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+// LLVM:    ret <8 x float> %{{.*}}
+__m256 test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
+  return _mm256_insertf128_ps(a, b, 1);
+}
+
+// CIR-LABEL: @test_mm256_insertf128_si256_0(
+// CIR: [[TMP0:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<4 x !s64i> -> 
!cir.vector<8 x !s32i>
+// CIR: [[TMP1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<2 x !s64i> -> 
!cir.vector<4 x !s32i>
+// CIR: %{{.*}} = cir.vec.shuffle([[TMP1]], %{{.*}} : !cir.vector<4 x !s32i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<8 x !s32i>
+// CIR-NEXT: %{{.*}} = cir.vec.shuffle([[TMP0]], %{{.*}} : !cir.vector<8 x 
!s32i>) [#cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, 
#cir.int<11> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !s32i>
+// CIR: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s32i> -> 
!cir.vector<4 x !s64i>
+// CIR: cir.return %{{.*}} : !cir.vector<4 x !s64i>
+
+// LLVM-LABEL: @test_mm256_insertf128_si256_0
+// LLVM:    [[TMP0:%.*]] = bitcast <4 x i64> %{{.*}} to <8 x i32>
+// LLVM:    [[TMP1:%.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+// LLVM:    [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> 
poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// LLVM-NEXT:    [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> 
[[WIDEN]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+// LLVM:    [[TMP2:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64>
+// LLVM:    ret <4 x i64> %{{.*}}
+__m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
+  return _mm256_insertf128_si256(a, b, 0);
+}
+
+// CIR-LABEL: @test_mm256_insertf128_si256_1(
+// CIR: [[TMP0:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<4 x !s64i> -> 
!cir.vector<8 x !s32i>
+// CIR: [[TMP1:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<2 x !s64i> -> 
!cir.vector<4 x !s32i>
+// CIR: %{{.*}} = cir.vec.shuffle([[TMP1]], %{{.*}} : !cir.vector<4 x !s32i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<8 x !s32i>
+// CIR-NEXT: %{{.*}} = cir.vec.shuffle([[TMP0]], %{{.*}} : !cir.vector<8 x 
!s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : 
!s32i, #cir.int<11> : !s32i] : !cir.vector<8 x !s32i>
+// CIR: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.vector<8 x !s32i> -> 
!cir.vector<4 x !s64i>
+// CIR: cir.return %{{.*}} : !cir.vector<4 x !s64i>
+
+// LLVM-LABEL: @test_mm256_insertf128_si256_1
+// LLVM:    [[TMP0:%.*]] = bitcast <4 x i64> %{{.*}} to <8 x i32>
+// LLVM:    [[TMP1:%.*]] = bitcast <2 x i64> %{{.*}} to <4 x i32>
+// LLVM:    [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> 
poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// LLVM-NEXT:    [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> 
[[WIDEN]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+// LLVM:    [[TMP2:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64>
+// LLVM:    ret <4 x i64> %{{.*}}
+__m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
+  return _mm256_insertf128_si256(a, b, 1);
+}
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/X86/avx2-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx2-builtins.c
new file mode 100644
index 0000000000000..aeea45c6e0446
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx2-builtins.c
@@ -0,0 +1,145 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o 
%t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir 
-emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o 
%t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir 
-emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o 
%t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir 
-emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o 
%t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx2 -fno-signed-char -fclangir 
-emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror 
| FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror 
| FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+// This test mimics clang/test/CodeGen/X86/avx2-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+#include <immintrin.h>
+
+// FIXME: We should also lower the __builtin_ia32_pblendw128 (and similar)
+//  functions to this IR. In the future we could delete the corresponding
+//  intrinsic in LLVM if it's not being used anymore.
+__m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
+  // CIR-LABEL: _mm256_blend_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!s16i>) [#cir.int<0> : !s32i, #cir.int<17> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i, #cir.int<8> : !s32i, #cir.int<25> : !s32i, 
#cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> 
: !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i] : !cir.vector<16 x !s16i>
+
+  // LLVM-LABEL: test_mm256_blend_epi16
+  // LLVM-NOT: @llvm.x86.avx2.pblendw
+  // LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> 
<i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 
10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  // OGCG-LABEL: test_mm256_blend_epi16
+  // OGCG-NOT: @llvm.x86.avx2.pblendw
+  // OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> 
<i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 
10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return _mm256_blend_epi16(a, b, 2);
+}
+
+__m128i test_mm_blend_epi32(__m128i a, __m128i b) {
+  // CIR-LABEL: _mm_blend_epi32
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) 
[#cir.int<4> : !s32i, #cir.int<1> : !s32i, #cir.int<6> : !s32i, #cir.int<3> : 
!s32i] : !cir.vector<4 x !s32i>
+
+  // LLVM-LABEL: test_mm_blend_epi32
+  // LLVM-NOT: @llvm.x86.avx2.pblendd.128
+  // LLVM: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 
4, i32 1, i32 6, i32 3>
+
+  // OGCG-LABEL: test_mm_blend_epi32
+  // OGCG-NOT: @llvm.x86.avx2.pblendd.128
+  // OGCG: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 
4, i32 1, i32 6, i32 3>
+  return _mm_blend_epi32(a, b, 0x05);
+}
+
+__m256i test_mm256_blend_epi32(__m256i a, __m256i b) {
+  // CIR-LABEL: _mm256_blend_epi32
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s32i>) 
[#cir.int<8> : !s32i, #cir.int<1> : !s32i, #cir.int<10> : !s32i, #cir.int<3> : 
!s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<6> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<8 x !s32i>
+
+  // LLVM-LABEL: test_mm256_blend_epi32
+  // LLVM-NOT: @llvm.x86.avx2.pblendd.256
+  // LLVM: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 
8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+
+  // OGCG-LABEL: test_mm256_blend_epi32
+  // OGCG-NOT: @llvm.x86.avx2.pblendd.256
+  // OGCG: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 
8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+  return _mm256_blend_epi32(a, b, 0x35);
+}
+
+__m256i test0_mm256_inserti128_si256(__m256i a, __m128i b) {
+
+  // CIR-LABEL: test0_mm256_inserti128_si256
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i] : !cir.vector<4 x !s64i>
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s64i>) 
[#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i] : !cir.vector<4 x !s64i>
+
+  // LLVM-LABEL: test0_mm256_inserti128_si256
+  // LLVM: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <4 x i32> <i32 
0, i32 1, i32 2, i32 3>
+  // LLVM: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 
4, i32 5, i32 2, i32 3>
+  return _mm256_inserti128_si256(a, b, 0);
+}
+
+__m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
+  // CIR-LABEL: test1_mm256_inserti128_si256
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i] : !cir.vector<4 x !s64i>
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s64i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : 
!s32i] : !cir.vector<4 x !s64i>
+
+  // LLVM-LABEL: test1_mm256_inserti128_si256
+  // LLVM: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <4 x i32> <i32 
0, i32 1, i32 2, i32 3>
+  // LLVM: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 
0, i32 1, i32 4, i32 5>
+  return _mm256_inserti128_si256(a, b, 1);
+}
+
+// Immediate should be truncated to one bit.
+__m256i test2_mm256_inserti128_si256(__m256i a, __m128i b) {
+  // CIR-LABEL: test2_mm256_inserti128_si256
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i] : !cir.vector<4 x !s64i>
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s64i>) 
[#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i] : !cir.vector<4 x !s64i>
+
+  // LLVM-LABEL: test2_mm256_inserti128_si256
+  // LLVM: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <4 x i32> <i32 
0, i32 1, i32 2, i32 3>
+  // LLVM: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 
4, i32 5, i32 2, i32 3>
+  return _mm256_inserti128_si256(a, b, 0);
+}
+
+__m256i test_mm256_shufflelo_epi16(__m256i a) {
+  // CIR-LABEL: _mm256_shufflelo_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!s16i>) [#cir.int<3> : !s32i, #cir.int<0> : !s32i, #cir.int<1> : !s32i, 
#cir.int<1> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i, #cir.int<11> : !s32i, #cir.int<8> : !s32i, 
#cir.int<9> : !s32i, #cir.int<9> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : 
!s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i] : !cir.vector<16 x !s16i>
+
+  // LLVM-LABEL: test_mm256_shufflelo_epi16
+  // LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> 
<i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, 
i32 9, i32 12, i32 13, i32 14, i32 15>
+
+  // OGCG-LABEL: test_mm256_shufflelo_epi16
+  // OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> 
<i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, 
i32 9, i32 12, i32 13, i32 14, i32 15>
+  return _mm256_shufflelo_epi16(a, 83);
+}
+
+__m256i test_mm256_shufflehi_epi16(__m256i a) {
+  // CIR-LABEL: _mm256_shufflehi_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<7> : !s32i, #cir.int<6> : !s32i, #cir.int<6> : 
!s32i, #cir.int<5> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, 
#cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<15> : !s32i, #cir.int<14> 
: !s32i, #cir.int<14> : !s32i, #cir.int<13> : !s32i] : !cir.vector<16 x !s16i>
+
+  // LLVM-LABEL: test_mm256_shufflehi_epi16
+  // LLVM: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, 
i32 11, i32 15, i32 14, i32 14, i32 13>
+
+  // OGCG-LABEL: test_mm256_shufflehi_epi16
+  // OGCG: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, 
i32 11, i32 15, i32 14, i32 14, i32 13>
+  return _mm256_shufflehi_epi16(a, 107);
+}
+
+__m256i test_mm256_alignr_epi8(__m256i a, __m256i b) {
+  // CIR-LABEL: test_mm256_alignr_epi8
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<32 x 
{{!s8i|!u8i}}>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, 
#cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<8> : 
!s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, 
#cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> 
: !s32i, #cir.int<32> : !s32i, #cir.int<33> : !s32i, #cir.int<18> : !s32i, 
#cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> 
: !s32i, #cir.int<23> : !s32i, #cir.int<24> : !s32i, #cir.int<25> : !s32i, 
#cir.int<26> : !s32i, #cir.int<27> : !s32i, #cir.int<28> : !s32i, #cir.int<29> 
: !s32i, #cir.int<30> : !s32i, #cir.int<31> : !s32i, #cir.int<48> : !s32i, 
#cir.int<49> : !s32i] : !cir.vector<32 x {{!s8i|!u8i}}>
+
+  // LLVM-LABEL: test_mm256_alignr_epi8
+  // LLVM: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 
2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 
13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 
23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 
49>
+
+  // OGCG-LABEL: test_mm256_alignr_epi8
+  // OGCG: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 
2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 
13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 
23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 
49>
+  return _mm256_alignr_epi8(a, b, 2);
+}
diff --git a/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
new file mode 100644
index 0000000000000..cae13feb22abe
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-cir -o 
%t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char  
-fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw  -fclangir -emit-llvm -o 
%t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux  -target-feature +avx512bw -fno-signed-char  
-fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall 
-Werror -Wsign-conversion | FileCheck %s --check-prefix=OGCG
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char 
-emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s 
--check-prefix=OGCG
+
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx512bw -emit-llvm -o - -Wall 
-Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx512bw -fno-signed-char 
-emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s 
--check-prefixes=OGCG
+
+#include <immintrin.h>
+
+__m512i test_mm512_shufflelo_epi16(__m512i __A) {
+  // CIR-LABEL: _mm512_shufflelo_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<32 x 
!s16i>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<0> : !s32i, 
#cir.int<0> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i, #cir.int<9> : !s32i, #cir.int<9> : !s32i, 
#cir.int<8> : !s32i, #cir.int<8> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : 
!s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<17> : !s32i, 
#cir.int<17> : !s32i, #cir.int<16> : !s32i, #cir.int<16> : !s32i, #cir.int<20> 
: !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i, 
#cir.int<25> : !s32i, #cir.int<25> : !s32i, #cir.int<24> : !s32i, #cir.int<24> 
: !s32i, #cir.int<28> : !s32i, #cir.int<29> : !s32i, #cir.int<30> : !s32i, 
#cir.int<31> : !s32i] : !cir.vector<32 x !s16i>
+
+  // LLVM-LABEL: @test_mm512_shufflelo_epi16
+  // LLVM: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> 
<i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, 
i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, 
i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, 
i32 31>
+
+  // OGCG-LABEL: @test_mm512_shufflelo_epi16
+  // OGCG: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> 
<i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, 
i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, 
i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, 
i32 31>
+  return _mm512_shufflelo_epi16(__A, 5);
+}
+
+__m512i test_mm512_shufflehi_epi16(__m512i __A) {
+  // CIR-LABEL: _mm512_shufflehi_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<32 x 
!s16i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<5> : !s32i, #cir.int<4> : 
!s32i, #cir.int<4> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, 
#cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<13> : !s32i, #cir.int<13> 
: !s32i, #cir.int<12> : !s32i, #cir.int<12> : !s32i, #cir.int<16> : !s32i, 
#cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<21> 
: !s32i, #cir.int<21> : !s32i, #cir.int<20> : !s32i, #cir.int<20> : !s32i, 
#cir.int<24> : !s32i, #cir.int<25> : !s32i, #cir.int<26> : !s32i, #cir.int<27> 
: !s32i, #cir.int<29> : !s32i, #cir.int<29> : !s32i, #cir.int<28> : !s32i, 
#cir.int<28> : !s32i] : !cir.vector<32 x !s16i>
+
+  // LLVM-LABEL: @test_mm512_shufflehi_epi16
+  // LLVM: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, 
i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, 
i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, 
i32 28>
+
+  // OGCG-LABEL: @test_mm512_shufflehi_epi16
+  // OGCG: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, 
i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, 
i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, 
i32 28>
+  return _mm512_shufflehi_epi16(__A, 5);
+}
+
+__m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){
+  // CIR-LABEL: _mm512_alignr_epi8
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<64 x 
{{!s8i|!u8i}}>) [#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, 
#cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<8> : 
!s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, 
#cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> 
: !s32i, #cir.int<64> : !s32i, #cir.int<65> : !s32i, #cir.int<18> : !s32i, 
#cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> 
: !s32i, #cir.int<23> : !s32i, #cir.int<24> : !s32i, #cir.int<25> : !s32i, 
#cir.int<26> : !s32i, #cir.int<27> : !s32i, #cir.int<28> : !s32i, #cir.int<29> 
: !s32i, #cir.int<30> : !s32i, #cir.int<31> : !s32i, #cir.int<80> : !s32i, 
#cir.int<81> : !s32i, #cir.int<34> : !s32i, #cir.int<35> : !s32i, #cir.int<36> 
: !s32i, #cir.int<37> : !s32i, #cir.int<38> : !s32i, #cir.int<39> : !s32i, 
#cir.int<40> : !s32i, #cir.int<41> : !s32i, #cir.int<42> : !s32i, #cir.int<43> 
: !s32i, #cir.int<44> : !s32i, #cir.int<45> : !s32i, #cir.int<46> : !s32i, 
#cir.int<47> : !s32i, #cir.int<96> : !s32i, #cir.int<97> : !s32i, #cir.int<50> 
: !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<53> : !s32i, 
#cir.int<54> : !s32i, #cir.int<55> : !s32i, #cir.int<56> : !s32i, #cir.int<57> 
: !s32i, #cir.int<58> : !s32i, #cir.int<59> : !s32i, #cir.int<60> : !s32i, 
#cir.int<61> : !s32i, #cir.int<62> : !s32i, #cir.int<63> : !s32i, #cir.int<112> 
: !s32i, #cir.int<113> : !s32i] : !cir.vector<64 x {{!s8i|!u8i}}>
+
+  // LLVM-LABEL: @test_mm512_alignr_epi8
+  // LLVM: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 
2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 
13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 
23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 
81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 
43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 
53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 
63, i32 112, i32 113>
+
+  // OGCG-LABEL: @test_mm512_alignr_epi8
+  // OGCG: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 
2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 
13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 
23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 
81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 
43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 
53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 
63, i32 112, i32 113>
+  return _mm512_alignr_epi8(__A, __B, 2);
+}
diff --git a/clang/test/CIR/CodeGen/X86/avx512dq-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx512dq-builtins.c
new file mode 100644
index 0000000000000..f7dac1c0c8045
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx512dq-builtins.c
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512dq -fclangir -emit-cir -o 
%t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512dq -fclangir -emit-llvm -o 
%t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx512dq -emit-llvm -o - -Wall 
-Werror | FileCheck %s --check-prefix=OGCG
+
+#include <immintrin.h>
+
+__m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) {
+  // CIR-LABEL: test_mm512_insertf32x8
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, 
#cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> 
: !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i] : !cir.vector<16 x 
!cir.float>
+
+  // LLVM-LABEL: @test_mm512_insertf32x8
+  // LLVM: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, 
i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+
+  // OGCG-LABEL: @test_mm512_insertf32x8
+  // OGCG: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, 
i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  return _mm512_insertf32x8(__A, __B, 1);
+}
+
+__m512i test_mm512_inserti32x8(__m512i __A, __m256i __B) {
+  // CIR-LABEL: test_mm512_inserti32x8
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : 
!s32i, #cir.int<7> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, 
#cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> 
: !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i] : !cir.vector<16 x !s32i>
+
+  // LLVM-LABEL: @test_mm512_inserti32x8
+  // LLVM: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 
18, i32 19, i32 20, i32 21, i32 22, i32 23>
+
+  // OGCG-LABEL: @test_mm512_inserti32x8
+  // OGCG: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 
18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  return _mm512_inserti32x8(__A, __B, 1);
+}
+
+__m512d test_mm512_insertf64x2(__m512d __A, __m128d __B) {
+  // CIR-LABEL: test_mm512_insertf64x2
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<8> : 
!s32i, #cir.int<9> : !s32i] : !cir.vector<8 x !cir.double>
+
+  // LLVM-LABEL: @test_mm512_insertf64x2
+  // LLVM: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+
+  // OGCG-LABEL: @test_mm512_insertf64x2
+  // OGCG: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+  return _mm512_insertf64x2(__A, __B, 3);
+}
+
+__m512i test_mm512_inserti64x2(__m512i __A, __m128i __B) {
+  // CIR-LABEL: test_mm512_inserti64x2
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s64i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : 
!s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<8 x !s64i>
+
+  // LLVM-LABEL: @test_mm512_inserti64x2
+  // LLVM: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 
0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+
+  // OGCG-LABEL: @test_mm512_inserti64x2
+  // OGCG: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 
0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+  return _mm512_inserti64x2(__A, __B, 1);
+}
diff --git a/clang/test/CIR/CodeGen/X86/avx512f-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
index dc54a87856a7c..9568a74ae1ba9 100644
--- a/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
@@ -77,3 +77,72 @@ __m512i test_mm512_undefined_epi32(void) {
   // OGCG: ret <8 x i64> zeroinitializer
   return _mm512_undefined_epi32();
 }
+
+__m512d test_mm512_insertf64x4(__m512d __A, __m256d __B) {
+  // CIR-LABEL: test_mm512_insertf64x4
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : 
!s32i, #cir.int<11> : !s32i] : !cir.vector<8 x !cir.double>
+
+  // LLVM-LABEL: test_mm512_insertf64x4
+  // LLVM: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+
+  // OGCG-LABEL: test_mm512_insertf64x4
+  // OGCG: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm512_insertf64x4(__A, __B, 1);
+}
+
+__m512 test_mm512_insertf32x4(__m512 __A, __m128 __B) {
+  // CIR-LABEL: test_mm512_insertf32x4
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : 
!s32i, #cir.int<19> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, 
#cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> 
: !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i] : !cir.vector<16 x 
!cir.float>
+
+  // LLVM-LABEL: test_mm512_insertf32x4
+  // LLVM: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, 
i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  // OGCG-LABEL: test_mm512_insertf32x4
+  // OGCG: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, 
i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return _mm512_insertf32x4(__A, __B, 1);
+}
+
+__m512i test_mm512_inserti64x4(__m512i __A, __m256i __B) {
+  // CIR-LABEL: test_mm512_inserti64x4
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s64i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, 
#cir.int<11> : !s32i] : !cir.vector<8 x !s64i>
+
+  // LLVM-LABEL: test_mm512_inserti64x4
+  // LLVM: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 
0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm512_inserti64x4(__A, __B, 1);
+}
+
+__m512i test_mm512_inserti32x4(__m512i __A, __m128i __B) {
+  // CIR-LABEL: test_mm512_inserti32x4
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : 
!s32i, #cir.int<19> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, 
#cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> 
: !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i] : !cir.vector<16 x !s32i>
+
+  // LLVM-LABEL: test_mm512_inserti32x4
+  // LLVM: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 
10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  // OGCG-LABEL: test_mm512_inserti32x4
+  // OGCG: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 
10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return _mm512_inserti32x4(__A, __B, 1);
+}
+
+__m512d test_mm512_shuffle_pd(__m512d __M, __m512d __V) {
+  // CIR-LABEL: test_mm512_shuffle_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<8> : !s32i, #cir.int<3> : !s32i, 
#cir.int<10> : !s32i, #cir.int<4> : !s32i, #cir.int<12> : !s32i, #cir.int<6> : 
!s32i, #cir.int<14> : !s32i] : !cir.vector<8 x !cir.double>
+
+  // LLVM-LABEL: test_mm512_shuffle_pd
+  // LLVM: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
<i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+
+  // OGCG-LABEL: test_mm512_shuffle_pd
+  // OGCG: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> 
<i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+  return _mm512_shuffle_pd(__M, __V, 4);
+}
+
+__m512 test_mm512_shuffle_ps(__m512 __M, __m512 __V) {
+  // CIR-LABEL: test_mm512_shuffle_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<16> : !s32i, 
#cir.int<16> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<20> : 
!s32i, #cir.int<20> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, 
#cir.int<24> : !s32i, #cir.int<24> : !s32i, #cir.int<12> : !s32i, #cir.int<13> 
: !s32i, #cir.int<28> : !s32i, #cir.int<28> : !s32i] : !cir.vector<16 x 
!cir.float>
+
+  // LLVM-LABEL: test_mm512_shuffle_ps
+  // LLVM: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, 
i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+
+  // OGCG-LABEL: test_mm512_shuffle_ps
+  // OGCG: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x 
i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, 
i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+  return _mm512_shuffle_ps(__M, __V, 4);
+}
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/X86/avx512vl-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx512vl-builtins.c
new file mode 100644
index 0000000000000..99d8fce371d0e
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx512vl-builtins.c
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl 
-fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl 
-fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+#include <immintrin.h>
+
+__m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) {
+  // CIR-LABEL: test_mm256_insertf32x4
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, 
#cir.int<3> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : 
!s32i, #cir.int<11> : !s32i] : !cir.vector<8 x !cir.float>
+
+  // LLVM-LABEL: @test_mm256_insertf32x4
+  // LLVM: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm256_insertf32x4(__A, __B, 1);
+}
+
+__m256i test_mm256_inserti32x4(__m256i __A, __m128i __B) {
+  // CIR-LABEL: test_mm256_inserti32x4
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s32i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, 
#cir.int<11> : !s32i] : !cir.vector<8 x !s32i>
+
+  // LLVM-LABEL: @test_mm256_inserti32x4
+  // LLVM: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 
0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm256_inserti32x4(__A, __B, 1);
+}
diff --git a/clang/test/CIR/CodeGen/X86/avx512vldq-builtins.c 
b/clang/test/CIR/CodeGen/X86/avx512vldq-builtins.c
new file mode 100644
index 0000000000000..da80514247d5b
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx512vldq-builtins.c
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512dq -target-feature 
+avx512vl -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512dq -target-feature 
+avx512vl -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +avx512dq -target-feature +avx512vl 
-emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+#include <immintrin.h>
+
+__m256d test_mm256_insertf64x2(__m256d __A, __m128d __B) {
+  // CIR-LABEL: test_mm256_insertf64x2
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<4> : !s32i, 
#cir.int<5> : !s32i] : !cir.vector<4 x !cir.double>
+
+  // LLVM-LABEL: @test_mm256_insertf64x2
+  // LLVM: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
<i32 0, i32 1, i32 4, i32 5>
+
+  // OGCG-LABEL: @test_mm256_insertf64x2
+  // OGCG: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> 
<i32 0, i32 1, i32 4, i32 5>
+  return _mm256_insertf64x2(__A, __B, 1);
+}
+
+__m256i test_mm256_inserti64x2(__m256i __A, __m128i __B) {
+  // CIR-LABEL: test_mm256_inserti64x2
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s64i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : 
!s32i] : !cir.vector<4 x !s64i>
+
+  // LLVM-LABEL: @test_mm256_inserti64x2
+  // LLVM: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 
0, i32 1, i32 4, i32 5>
+
+  // OGCG-LABEL: @test_mm256_inserti64x2
+  // OGCG: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 
0, i32 1, i32 4, i32 5>
+  return _mm256_inserti64x2(__A, __B, 1);
+}
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/X86/palignr.c 
b/clang/test/CIR/CodeGen/X86/palignr.c
new file mode 100644
index 0000000000000..6da4535c08429
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/palignr.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 %s -triple=x86_64-unknown-linux -target-feature +ssse3 
-fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=CIR
+
+// RUN: %clang_cc1 %s -triple=x86_64-unknown-linux -target-feature +ssse3 
-emit-llvm -o %t_og.ll
+// RUN: FileCheck --input-file=%t_og.ll %s --check-prefix=OGCG
+
+#define _mm_alignr_epi8(a, b, n) (__builtin_ia32_palignr128((a), (b), (n)))
+typedef __attribute__((vector_size(16))) int int4;
+
+// CIR-LABEL: @align1
+// CIR: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 
15, i32 16, i32 17
+// OGCG-LABEL: @align1
+// OGCG: %palignr = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x 
i32> <i32 15, i32 16, i32 17
+int4 align1(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 15); }
+
+// CIR-LABEL: @align2
+// CIR: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 
16, i32 17, i32 18
+// OGCG-LABEL: @align2
+// OGCG: %palignr = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x 
i32> <i32 16, i32 17, i32 18
+int4 align2(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 16); }
+
+// CIR-LABEL: @align3
+// CIR: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <16 x i32> 
<i32 1, i32 2, i32 3
+// OGCG-LABEL: @align3
+// OGCG: %palignr = shufflevector <16 x i8> %{{.*}}, <16 x i8> 
zeroinitializer, <16 x i32> <i32 1, i32 2, i32 3
+int4 align3(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 17); }
+
+// CIR-LABEL: @align4
+// CIR: store <4 x i32> zeroinitializer, ptr %{{.*}}, align 16
+// OGCG-LABEL: @align4
+// OGCG: ret <4 x i32> zeroinitializer
+int4 align4(int4 a, int4 b) { return _mm_alignr_epi8(a, b, 32); }
diff --git a/clang/test/CIR/CodeGen/X86/sse-builtins.c 
b/clang/test/CIR/CodeGen/X86/sse-builtins.c
index c893859b297cc..a2a5b1849d727 100644
--- a/clang/test/CIR/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/sse-builtins.c
@@ -71,3 +71,15 @@ __m128 test_mm_undefined_ps(void) {
   // OGCG: ret <4 x float> zeroinitializer
   return _mm_undefined_ps();
 }
+
+__m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
+  // CIR-LABEL: _mm_shuffle_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<4> : !s32i, 
#cir.int<4> : !s32i] : !cir.vector<4 x !cir.float>
+
+  // CHECK-LABEL: test_mm_shuffle_ps
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> 
<i32 0, i32 0, i32 4, i32 4>
+
+  // OGCG-LABEL: test_mm_shuffle_ps
+  // OGCG: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> 
<i32 0, i32 0, i32 4, i32 4>
+  return _mm_shuffle_ps(A, B, 0);
+}
diff --git a/clang/test/CIR/CodeGen/X86/sse2-builtins.c 
b/clang/test/CIR/CodeGen/X86/sse2-builtins.c
index f5e07cdc28ccd..31a297bd3cb52 100644
--- a/clang/test/CIR/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/sse2-builtins.c
@@ -8,8 +8,11 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse2 -fno-signed-char -fclangir 
-emit-llvm -o %t.ll -Wall -Werror
 // RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
 
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse -emit-llvm -o - -Wall -Werror 
| FileCheck %s -check-prefix=OGCG
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse -emit-llvm -o - -Wall -Werror 
| FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror 
| FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror 
| FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
 
 // This test mimics clang/test/CodeGen/X86/sse2-builtins.c, which eventually
 // CIR shall be able to support fully.
@@ -108,3 +111,39 @@ void test_mm_pause(void) {
   // LLVM: call void @llvm.x86.sse2.pause()
   // OGCG: call void @llvm.x86.sse2.pause()
 }
+
+__m128i test_mm_shufflelo_epi16(__m128i A) {
+  // CIR-LABEL: _mm_shufflelo_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) 
[#cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : 
!s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<8 x !s16i>
+
+  // LLVM-LABEL: test_mm_shufflelo_epi16
+  // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 
0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+
+  // OGCG-LABEL: test_mm_shufflelo_epi16
+  // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 
0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  return _mm_shufflelo_epi16(A, 0);
+}
+
+__m128i test_mm_shufflehi_epi16(__m128i A) {
+  // CIR-LABEL: _mm_shufflehi_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) 
[#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : 
!s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, #cir.int<4> : !s32i, 
#cir.int<4> : !s32i] : !cir.vector<8 x !s16i>
+
+  // LLVM-LABEL: test_mm_shufflehi_epi16
+  // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 
0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+
+  // OGCG-LABEL: test_mm_shufflehi_epi16
+  // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 
0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+  return _mm_shufflehi_epi16(A, 0);
+}
+
+__m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
+  // CIR-LABEL: test_mm_shuffle_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.double>) [#cir.int<1> : !s32i, #cir.int<2> : !s32i] : !cir.vector<2 x 
!cir.double>
+
+  // CHECK-LABEL: test_mm_shuffle_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x 
i32> <i32 1, i32 2>
+
+  // OGCG-LABEL: test_mm_shuffle_pd
+  // OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> 
<i32 1, i32 2>
+  return _mm_shuffle_pd(A, B, 1);
+}
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/X86/sse3-builtins.c 
b/clang/test/CIR/CodeGen/X86/sse3-builtins.c
new file mode 100644
index 0000000000000..2072cc160988d
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/sse3-builtins.c
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +ssse3 -fclangir -emit-cir -o 
%t.cir -Wall -Werror
+// RUN: FileCheck --check-prefixes=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +ssse3 -fclangir -emit-llvm -o 
%t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall 
-Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall 
-Werror | FileCheck %s --check-prefixes=OGCG
+
+#include <immintrin.h>
+
+__m128i test_mm_alignr_epi8(__m128i a, __m128i b) {
+  // CIR-LABEL: _mm_alignr_epi8
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) 
[#cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : 
!s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<8> : !s32i, 
#cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : 
!s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, 
#cir.int<16> : !s32i, #cir.int<17> : !s32i] : !cir.vector<16 x !s8i>
+
+  // LLVM-LABEL: test_mm_alignr_epi8
+  // LLVM: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 
2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 
13, i32 14, i32 15, i32 16, i32 17>
+
+  // OGCG-LABEL: test_mm_alignr_epi8
+  // OGCG: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 
2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 
13, i32 14, i32 15, i32 16, i32 17>
+  return _mm_alignr_epi8(a, b, 2);
+}
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/X86/sse41-builtins.c 
b/clang/test/CIR/CodeGen/X86/sse41-builtins.c
new file mode 100644
index 0000000000000..462ead5789f88
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/sse41-builtins.c
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o 
%t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse4.1 -fno-signed-char -fclangir 
-emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o 
%t.ll -Wall -Werror
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +sse4.1 -fno-signed-char -fclangir 
-emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall 
-Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall 
-Werror | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm 
-o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+// This test mimics clang/test/CodeGen/X86/sse41-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+#include <immintrin.h>
+
+__m128i test_mm_blend_epi16(__m128i V1, __m128i V2) {
+  // CIR-LABEL: test_mm_blend_epi16
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) 
[#cir.int<0> : !s32i, #cir.int<9> : !s32i, #cir.int<2> : !s32i, #cir.int<11> : 
!s32i, #cir.int<4> : !s32i, #cir.int<13> : !s32i, #cir.int<6> : !s32i, 
#cir.int<7> : !s32i] : !cir.vector<8 x !s16i>
+
+  // LLVM-LABEL: test_mm_blend_epi16
+  // LLVM: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 
0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
+
+  // OGCG-LABEL: test_mm_blend_epi16
+  // OGCG: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 
0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
+  return _mm_blend_epi16(V1, V2, 42);
+}
+
+__m128d test_mm_blend_pd(__m128d V1, __m128d V2) {
+  // CIR-LABEL: test_mm_blend_pd
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.double>) [#cir.int<0> : !s32i, #cir.int<3> : !s32i] : !cir.vector<2 x 
!cir.double>
+
+  // LLVM-LABEL: test_mm_blend_pd
+  // LLVM: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> 
<i32 0, i32 3>
+
+  // OGCG-LABEL: test_mm_blend_pd
+  // OGCG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> 
<i32 0, i32 3>
+  return _mm_blend_pd(V1, V2, 2);
+}
+
+__m128 test_mm_blend_ps(__m128 V1, __m128 V2) {
+  // CIR-LABEL: test_mm_blend_ps
+  // CIR: %{{.*}} = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.float>) [#cir.int<0> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.float>
+
+  // LLVM-LABEL: test_mm_blend_ps
+  // LLVM: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
<i32 0, i32 5, i32 6, i32 3>
+
+  // OGCG-LABEL: test_mm_blend_ps
+  // OGCG: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> 
<i32 0, i32 5, i32 6, i32 3>
+  return _mm_blend_ps(V1, V2, 6);
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR] Upstream vec shuffle builtins in CIR codegen (PR #169178)

Reply via email to