[clang] [llvm] [CIR] Support x86 builtin rotate (PR #169566)

Omar Hossam via cfe-commits Fri, 28 Nov 2025 22:48:32 -0800

https://github.com/moar55 updated 
https://github.com/llvm/llvm-project/pull/169566


>From cc24b4efe622a5d1b081aaa5d87dd029cea4940d Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Thu, 20 Nov 2025 23:12:40 +0100
Subject: [PATCH 01/10] [CIR] Implement x86 rotate builtins

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 28 +++++++++++++++++++---
 shell.nix                                  | 14 +++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)
 create mode 100644 shell.nix

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 0e43345bad6f1..1070165e04276 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -85,6 +85,25 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc,
   return maskVec;
 }
 
+static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf, const CallExpr *e,
+                                      mlir::Value &op0, mlir::Value &op1,
+                                      mlir::Value &amt, bool isRight) {
+  auto ty = op0.getType();
+
+  // Amount may be scalar immediate, in which case create a splat vector.
+  // Funnel shift amounts are treated as modulo and types are all power-of-2
+  // so we only care about the lowest log2 bits anyway.
+  if (amt.getType() != ty) {
+    amt = cgf.getBuilder().createIntCast(
+        amt, mlir::cast<cir::VectorType>(ty).getElementType());
+    amt = cir::VecSplatOp::create(cgf.getBuilder(), cgf.getLoc(e->getExprLoc()),
+                                  ty, amt);
+  }
+
+  const std::string intrinsicName = isRight ? "fshr" : "fshl";
+  return emitIntrinsicCallOp(cgf, e, intrinsicName, ty, op0, op1, amt);
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -104,14 +123,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   // evaluation.
   assert(!cir::MissingFeatures::msvcBuiltins());
 
-  // Find out if any arguments are required to be integer constant expressions.
+  // Find out if any arguments are required to be integer constant
+  // expressions.
   assert(!cir::MissingFeatures::handleBuiltinICEArguments());
 
   // The operands of the builtin call
   llvm::SmallVector<mlir::Value> ops;
 
-  // `ICEArguments` is a bitmap indicating whether the argument at the i-th bit
-  // is required to be a constant integer expression.
+  // `ICEArguments` is a bitmap indicating whether the argument at the i-th
+  // bit is required to be a constant integer expression.
   unsigned iceArguments = 0;
   ASTContext::GetBuiltinTypeError error;
   getContext().GetBuiltinType(builtinID, error, &iceArguments);
@@ -661,12 +681,14 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_prolq128:
   case X86::BI__builtin_ia32_prolq256:
   case X86::BI__builtin_ia32_prolq512:
+    return emitX86FunnelShift(*this, e, ops[0], ops[1], ops[1], false);
   case X86::BI__builtin_ia32_prord128:
   case X86::BI__builtin_ia32_prord256:
   case X86::BI__builtin_ia32_prord512:
   case X86::BI__builtin_ia32_prorq128:
   case X86::BI__builtin_ia32_prorq256:
   case X86::BI__builtin_ia32_prorq512:
+    return emitX86FunnelShift(*this, e, ops[0], ops[1], ops[1], true);
   case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
   case X86::BI__builtin_ia32_selectb_512:
diff --git a/shell.nix b/shell.nix
new file mode 100644
index 0000000000000..c30f6dc7b6928
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,14 @@
+let
+  nixpkgs = fetchTarball "https://github.com/NixOS/nixpkgs/tarball/nixos-24.05";
+  pkgs = import nixpkgs { config = {}; overlays = []; };
+in
+
+
+pkgs.mkShellNoCC {
+  packages = with pkgs; [
+    cmake
+    ninja
+    llvmPackages_latest.llvm
+  ];
+stdenv = pkgs.clangStdenv;
+}

>From ecbb71826710961d0fa60e2ddd5f3114171fd13b Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Sun, 23 Nov 2025 15:36:45 +0100
Subject: [PATCH 02/10] update and add test

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  17 +-
 .../X86/builtin_test_helpers.h                | 304 ++++++++++++++++++
 .../CIR/CodeGenBuiltins/X86/xop-builtin.c     |  82 +++++
 3 files changed, 397 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/builtin_test_helpers.h
 create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 1070165e04276..e844c41a80577 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -13,6 +13,8 @@
 
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/ValueRange.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/CIR/MissingFeatures.h"
@@ -85,7 +87,8 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc,
   return maskVec;
 }
 
-static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf, const CallExpr *e,
+static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf,
+                                      const mlir::Location &location,
                                       mlir::Value &op0, mlir::Value &op1,
                                       mlir::Value &amt, bool isRight) {
   auto ty = op0.getType();
@@ -96,12 +99,12 @@ static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf, const CallExpr *e,
   if (amt.getType() != ty) {
     amt = cgf.getBuilder().createIntCast(
         amt, mlir::cast<cir::VectorType>(ty).getElementType());
-    amt = cir::VecSplatOp::create(cgf.getBuilder(), cgf.getLoc(e->getExprLoc()),
-                                  ty, amt);
+    amt = cir::VecSplatOp::create(cgf.getBuilder(), location, ty, amt);
   }
 
   const std::string intrinsicName = isRight ? "fshr" : "fshl";
-  return emitIntrinsicCallOp(cgf, e, intrinsicName, ty, op0, op1, amt);
+  return emitIntrinsicCallOp(cgf.getBuilder(), location, intrinsicName, ty,
+                             mlir::ValueRange{op0, op1, amt});
 }
 
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
@@ -681,14 +684,16 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_prolq128:
   case X86::BI__builtin_ia32_prolq256:
   case X86::BI__builtin_ia32_prolq512:
-    return emitX86FunnelShift(*this, e, ops[0], ops[1], ops[1], false);
+    return emitX86FunnelShift(*this, getLoc(expr->getExprLoc()), ops[0], ops[1],
+                              ops[1], false);
   case X86::BI__builtin_ia32_prord128:
   case X86::BI__builtin_ia32_prord256:
   case X86::BI__builtin_ia32_prord512:
   case X86::BI__builtin_ia32_prorq128:
   case X86::BI__builtin_ia32_prorq256:
   case X86::BI__builtin_ia32_prorq512:
-    return emitX86FunnelShift(*this, e, ops[0], ops[1], ops[1], true);
+    return emitX86FunnelShift(*this, getLoc(expr->getExprLoc()), ops[0], ops[1],
+                              ops[1], true);
   case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
   case X86::BI__builtin_ia32_selectb_512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/builtin_test_helpers.h b/clang/test/CIR/CodeGenBuiltins/X86/builtin_test_helpers.h
new file mode 100644
index 0000000000000..fcaf360626a2d
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/builtin_test_helpers.h
@@ -0,0 +1,304 @@
+/* Helper methods for builtin intrinsic tests */
+
+#include <immintrin.h>
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+
+constexpr bool match_m64(__m64 _v, unsigned long long a) {
+  __v1du v = (__v1du)_v;
+  return v[0] == a;
+}
+
+constexpr bool match_v1di(__m64 v, long long a) {
+  return v[0] == a;
+}
+
+constexpr bool match_v1du(__m64 _v, unsigned long long a) {
+  __v1du v = (__v1du)_v;
+  return v[0] == a;
+}
+
+constexpr bool match_v2si(__m64 _v, int a, int b) {
+  __v2si v = (__v2si)_v;
+  return v[0] == a && v[1] == b;
+}
+
+constexpr bool match_v2su(__m64 _v, unsigned a, unsigned b) {
+  __v2su v = (__v2su)_v;
+  return v[0] == a && v[1] == b;
+}
+
+constexpr bool match_v4hi(__m64 _v, short a, short b, short c, short d) {
+  __v4hi v = (__v4hi)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+}
+
+constexpr bool match_v4hu(__m64 _v, unsigned short a, unsigned short b, 
unsigned short c, unsigned short d) {
+  __v4hu v = (__v4hu)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+}
+
+constexpr bool match_v8qi(__m64 _v, signed char a, signed char b, signed char 
c, signed char d, signed char e, signed char f, signed char g, signed char h) {
+  __v8qs v = (__v8qs)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h;
+}
+
+constexpr bool match_v8qu(__m64 _v, unsigned char a, unsigned char b, unsigned 
char c, unsigned char d, unsigned char e, unsigned char f, unsigned char g, 
unsigned char h) {
+  __v8qu v = (__v8qu)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h;
+}
+
+constexpr bool match_m128(__m128 _v, float a, float b, float c, float d) {
+  __v4su v = (__v4su)_v;
+  return v[0] == __builtin_bit_cast(unsigned, a) && v[1] == 
__builtin_bit_cast(unsigned, b) && v[2] == __builtin_bit_cast(unsigned, c) && 
v[3] == __builtin_bit_cast(unsigned, d);
+}
+
+constexpr bool match_m128d(__m128d _v, double a, double b) {
+  __v2du v = (__v2du)_v;
+  return v[0] == __builtin_bit_cast(unsigned long long, a) && v[1] == 
__builtin_bit_cast(unsigned long long, b);
+}
+
+#ifdef __SSE2__
+constexpr bool match_m128h(__m128h _v, _Float16 __e00, _Float16 __e01, 
_Float16 __e02, _Float16 __e03, _Float16 __e04, _Float16 __e05, _Float16 __e06, 
_Float16 __e07) {
+  __v8hu v = (__v8hu)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned short, __e00) && v[ 1] == 
__builtin_bit_cast(unsigned short, __e01) && v[ 2] == 
__builtin_bit_cast(unsigned short, __e02) && v[ 3] == 
__builtin_bit_cast(unsigned short, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned short, __e04) && v[ 5] == 
__builtin_bit_cast(unsigned short, __e05) && v[ 6] == 
__builtin_bit_cast(unsigned short, __e06) && v[ 7] == 
__builtin_bit_cast(unsigned short, __e07);
+}
+#endif
+
+constexpr bool match_m128i(__m128i _v, unsigned long long a, unsigned long 
long b) {
+  __v2du v = (__v2du)_v;
+  return v[0] == a && v[1] == b;
+}
+
+constexpr bool match_v2di(__m128i v, long long a, long long b) {
+  return v[0] == a && v[1] == b;
+}
+
+constexpr bool match_v2du(__m128i _v, unsigned long long a, unsigned long long 
b) {
+  __v2du v = (__v2du)_v;
+  return v[0] == a && v[1] == b;
+}
+
+constexpr bool match_v4si(__m128i _v, int a, int b, int c, int d) {
+  __v4si v = (__v4si)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+}
+
+constexpr bool match_v4su(__m128i _v, unsigned a, unsigned b, unsigned c, 
unsigned d) {
+  __v4su v = (__v4su)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+}
+
+constexpr bool match_v8hi(__m128i _v, short a, short b, short c, short d, 
short e, short f, short g, short h) {
+  __v8hi v = (__v8hi)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h;
+}
+
+constexpr bool match_v8hu(__m128i _v, unsigned short a, unsigned short b, 
unsigned short c, unsigned short d, unsigned short e, unsigned short f, 
unsigned short g, unsigned short h) {
+  __v8hu v = (__v8hu)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h;
+}
+
+constexpr bool match_v16qi(__m128i _v, signed char a, signed char b, signed 
char c, signed char d, signed char e, signed char f, signed char g, signed char 
h, signed char i, signed char j, signed char k, signed char l, signed char m, 
signed char n, signed char o, signed char p) {
+  __v16qs v = (__v16qs)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] 
== l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
+}
+
+constexpr bool match_v16qu(__m128i _v, unsigned char a, unsigned char b, 
unsigned char c, unsigned char d, unsigned char e, unsigned char f, unsigned 
char g, unsigned char h, unsigned char i, unsigned char j, unsigned char k, 
unsigned char l, unsigned char m, unsigned char n, unsigned char o, unsigned 
char p) {
+  __v16qu v = (__v16qu)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] 
== l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
+}
+
+constexpr bool match_m256(__m256 _v, float __e00, float __e01, float __e02, 
float __e03, float __e04, float __e05, float __e06, float __e07) {
+  __v8su v = (__v8su)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned, __e00) && v[ 1] == 
__builtin_bit_cast(unsigned, __e01) && v[ 2] == __builtin_bit_cast(unsigned, 
__e02) && v[ 3] == __builtin_bit_cast(unsigned, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned, __e04) && v[ 5] == 
__builtin_bit_cast(unsigned, __e05) && v[ 6] == __builtin_bit_cast(unsigned, 
__e06) && v[ 7] == __builtin_bit_cast(unsigned, __e07);
+}
+
+constexpr bool match_m256d(__m256d _v, double a, double b, double c, double d) 
{
+  __v4du v = (__v4du)_v;
+  return v[0] == __builtin_bit_cast(unsigned long long, a) && v[1] == 
__builtin_bit_cast(unsigned long long, b) && v[2] == 
__builtin_bit_cast(unsigned long long, c) && v[3] == 
__builtin_bit_cast(unsigned long long, d);
+}
+
+#ifdef __SSE2__
+constexpr bool match_m256h(__m256h _v, _Float16 __e00, _Float16 __e01, 
_Float16 __e02, _Float16 __e03, _Float16 __e04, _Float16 __e05, _Float16 __e06, 
_Float16 __e07,
+                                       _Float16 __e08, _Float16 __e09, 
_Float16 __e10, _Float16 __e11, _Float16 __e12, _Float16 __e13, _Float16 __e14, 
_Float16 __e15) {
+  __v16hu v = (__v16hu)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned short, __e00) && v[ 1] == 
__builtin_bit_cast(unsigned short, __e01) && v[ 2] == 
__builtin_bit_cast(unsigned short, __e02) && v[ 3] == 
__builtin_bit_cast(unsigned short, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned short, __e04) && v[ 5] == 
__builtin_bit_cast(unsigned short, __e05) && v[ 6] == 
__builtin_bit_cast(unsigned short, __e06) && v[ 7] == 
__builtin_bit_cast(unsigned short, __e07) &&
+         v[ 8] == __builtin_bit_cast(unsigned short, __e08) && v[ 9] == 
__builtin_bit_cast(unsigned short, __e09) && v[10] == 
__builtin_bit_cast(unsigned short, __e10) && v[11] == 
__builtin_bit_cast(unsigned short, __e11) &&
+         v[12] == __builtin_bit_cast(unsigned short, __e12) && v[13] == 
__builtin_bit_cast(unsigned short, __e13) && v[14] == 
__builtin_bit_cast(unsigned short, __e14) && v[15] == 
__builtin_bit_cast(unsigned short, __e15);
+}
+#endif
+
+constexpr bool match_m256i(__m256i _v, unsigned long long a, unsigned long 
long b, unsigned long long c, unsigned long long d) {
+  __v4du v = (__v4du)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+}
+
+constexpr bool match_v4di(__m256i _v, long long a, long long b, long long c, 
long long d) {
+  __v4di v = (__v4di)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d;
+}
+
+constexpr bool match_v8si(__m256i _v, int a, int b, int c, int d, int e, int 
f, int g, int h) {
+  __v8si v = (__v8si)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h;
+}
+
+constexpr bool match_v8su(__m256i _v, unsigned a, unsigned b, unsigned c, 
unsigned d, unsigned e, unsigned f, unsigned g, unsigned h) {
+  __v8su v = (__v8su)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h;
+}
+
+constexpr bool match_v16hi(__m256i _v, short a, short b, short c, short d, 
short e, short f, short g, short h, short i, short j, short k, short l, short 
m, short n, short o, short p) {
+  __v16hi v = (__v16hi)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] 
== l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
+}
+
+constexpr bool match_v16hu(__m256i _v, unsigned short a, unsigned short b, 
unsigned short c, unsigned short d, unsigned short e, unsigned short f, 
unsigned short g, unsigned short h, unsigned short i, unsigned short j, 
unsigned short k, unsigned short l, unsigned short m, unsigned short n, 
unsigned short o, unsigned short p) {
+  __v16hu v = (__v16hu)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] 
== l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
+}
+
+constexpr bool match_v32qi(__m256i _v, signed char __b00, signed char __b01, 
signed char __b02, signed char __b03, signed char __b04, signed char __b05, 
signed char __b06, signed char __b07,
+                                       signed char __b08, signed char __b09, 
signed char __b10, signed char __b11, signed char __b12, signed char __b13, 
signed char __b14, signed char __b15,
+                                       signed char __b16, signed char __b17, 
signed char __b18, signed char __b19, signed char __b20, signed char __b21, 
signed char __b22, signed char __b23,
+                                       signed char __b24, signed char __b25, 
signed char __b26, signed char __b27, signed char __b28, signed char __b29, 
signed char __b30, signed char __b31) {
+  __v32qs v = (__v32qs)_v;
+  return v[ 0] == __b00 && v[ 1] == __b01 && v[ 2] == __b02 && v[ 3] == __b03 
&& v[ 4] == __b04 && v[ 5] == __b05 && v[ 6] == __b06 && v[ 7] ==  __b07 &&
+         v[ 8] == __b08 && v[ 9] == __b09 && v[10] == __b10 && v[11] == __b11 
&& v[12] == __b12 && v[13] == __b13 && v[14] == __b14 && v[15] ==  __b15 &&
+         v[16] == __b16 && v[17] == __b17 && v[18] == __b18 && v[19] == __b19 
&& v[20] == __b20 && v[21] == __b21 && v[22] == __b22 && v[23] ==  __b23 &&
+         v[24] == __b24 && v[25] == __b25 && v[26] == __b26 && v[27] == __b27 
&& v[28] == __b28 && v[29] == __b29 && v[30] == __b30 && v[31] ==  __b31;
+}
+
+constexpr bool match_v32qu(__m256i _v, unsigned char __b00, unsigned char 
__b01, unsigned char __b02, unsigned char __b03, unsigned char __b04, unsigned 
char __b05, unsigned char __b06, unsigned char __b07,
+                                       unsigned char __b08, unsigned char 
__b09, unsigned char __b10, unsigned char __b11, unsigned char __b12, unsigned 
char __b13, unsigned char __b14, unsigned char __b15,
+                                       unsigned char __b16, unsigned char 
__b17, unsigned char __b18, unsigned char __b19, unsigned char __b20, unsigned 
char __b21, unsigned char __b22, unsigned char __b23,
+                                       unsigned char __b24, unsigned char 
__b25, unsigned char __b26, unsigned char __b27, unsigned char __b28, unsigned 
char __b29, unsigned char __b30, unsigned char __b31) {
+  __v32qu v = (__v32qu)_v;
+  return v[ 0] == __b00 && v[ 1] == __b01 && v[ 2] == __b02 && v[ 3] == __b03 
&& v[ 4] == __b04 && v[ 5] == __b05 && v[ 6] == __b06 && v[ 7] ==  __b07 &&
+         v[ 8] == __b08 && v[ 9] == __b09 && v[10] == __b10 && v[11] == __b11 
&& v[12] == __b12 && v[13] == __b13 && v[14] == __b14 && v[15] ==  __b15 &&
+         v[16] == __b16 && v[17] == __b17 && v[18] == __b18 && v[19] == __b19 
&& v[20] == __b20 && v[21] == __b21 && v[22] == __b22 && v[23] ==  __b23 &&
+         v[24] == __b24 && v[25] == __b25 && v[26] == __b26 && v[27] == __b27 
&& v[28] == __b28 && v[29] == __b29 && v[30] == __b30 && v[31] ==  __b31;
+}
+
+constexpr bool match_m512(__m512 _v, float __e00, float __e01, float __e02, 
float __e03, float __e04, float __e05, float __e06, float __e07, float __e08, 
float __e09, float __e10, float __e11, float __e12, float __e13, float __e14, 
float __e15) {
+  __v16su v = (__v16su)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned, __e00) && v[ 1] == 
__builtin_bit_cast(unsigned, __e01) && v[ 2] == __builtin_bit_cast(unsigned, 
__e02) && v[ 3] == __builtin_bit_cast(unsigned, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned, __e04) && v[ 5] == 
__builtin_bit_cast(unsigned, __e05) && v[ 6] == __builtin_bit_cast(unsigned, 
__e06) && v[ 7] == __builtin_bit_cast(unsigned, __e07) &&
+         v[ 8] == __builtin_bit_cast(unsigned, __e08) && v[ 9] == 
__builtin_bit_cast(unsigned, __e09) && v[10] == __builtin_bit_cast(unsigned, 
__e10) && v[11] == __builtin_bit_cast(unsigned, __e11) &&
+         v[12] == __builtin_bit_cast(unsigned, __e12) && v[13] == 
__builtin_bit_cast(unsigned, __e13) && v[14] == __builtin_bit_cast(unsigned, 
__e14) && v[15] == __builtin_bit_cast(unsigned, __e15);
+}
+
+constexpr bool match_m512d(__m512d _v, double __e00, double __e01, double 
__e02, double __e03, double __e04, double __e05, double __e06, double __e07) {
+  __v8du v = (__v8du)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned long long, __e00) && v[ 1] == 
__builtin_bit_cast(unsigned long long, __e01) && v[ 2] == 
__builtin_bit_cast(unsigned long long, __e02) && v[ 3] == 
__builtin_bit_cast(unsigned long long, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned long long, __e04) && v[ 5] == 
__builtin_bit_cast(unsigned long long, __e05) && v[ 6] == 
__builtin_bit_cast(unsigned long long, __e06) && v[ 7] == 
__builtin_bit_cast(unsigned long long, __e07);
+}
+
+#ifdef __SSE2__
+constexpr bool match_m512h(__m512h _v, _Float16 __e00, _Float16 __e01, 
_Float16 __e02, _Float16 __e03, _Float16 __e04, _Float16 __e05, _Float16 __e06, 
_Float16 __e07,
+                                       _Float16 __e08, _Float16 __e09, 
_Float16 __e10, _Float16 __e11, _Float16 __e12, _Float16 __e13, _Float16 __e14, 
_Float16 __e15,
+                                       _Float16 __e16, _Float16 __e17, 
_Float16 __e18, _Float16 __e19, _Float16 __e20, _Float16 __e21, _Float16 __e22, 
_Float16 __e23,
+                                       _Float16 __e24, _Float16 __e25, 
_Float16 __e26, _Float16 __e27, _Float16 __e28, _Float16 __e29, _Float16 __e30, 
_Float16 __e31) {
+  __v32hu v = (__v32hu)_v;
+  return v[ 0] == __builtin_bit_cast(unsigned short, __e00) && v[ 1] == 
__builtin_bit_cast(unsigned short, __e01) && v[ 2] == 
__builtin_bit_cast(unsigned short, __e02) && v[ 3] == 
__builtin_bit_cast(unsigned short, __e03) &&
+         v[ 4] == __builtin_bit_cast(unsigned short, __e04) && v[ 5] == 
__builtin_bit_cast(unsigned short, __e05) && v[ 6] == 
__builtin_bit_cast(unsigned short, __e06) && v[ 7] == 
__builtin_bit_cast(unsigned short, __e07) &&
+         v[ 8] == __builtin_bit_cast(unsigned short, __e08) && v[ 9] == 
__builtin_bit_cast(unsigned short, __e09) && v[10] == 
__builtin_bit_cast(unsigned short, __e10) && v[11] == 
__builtin_bit_cast(unsigned short, __e11) &&
+         v[12] == __builtin_bit_cast(unsigned short, __e12) && v[13] == 
__builtin_bit_cast(unsigned short, __e13) && v[14] == 
__builtin_bit_cast(unsigned short, __e14) && v[15] == 
__builtin_bit_cast(unsigned short, __e15) &&
+         v[16] == __builtin_bit_cast(unsigned short, __e16) && v[17] == 
__builtin_bit_cast(unsigned short, __e17) && v[18] == 
__builtin_bit_cast(unsigned short, __e18) && v[19] == 
__builtin_bit_cast(unsigned short, __e19) &&
+         v[20] == __builtin_bit_cast(unsigned short, __e20) && v[21] == 
__builtin_bit_cast(unsigned short, __e21) && v[22] == 
__builtin_bit_cast(unsigned short, __e22) && v[23] == 
__builtin_bit_cast(unsigned short, __e23) &&
+         v[24] == __builtin_bit_cast(unsigned short, __e24) && v[25] == 
__builtin_bit_cast(unsigned short, __e25) && v[26] == 
__builtin_bit_cast(unsigned short, __e26) && v[27] == 
__builtin_bit_cast(unsigned short, __e27) &&
+         v[28] == __builtin_bit_cast(unsigned short, __e28) && v[29] == 
__builtin_bit_cast(unsigned short, __e29) && v[30] == 
__builtin_bit_cast(unsigned short, __e30) && v[31] == 
__builtin_bit_cast(unsigned short, __e31);
+}
+#endif
+
+constexpr bool match_m512i(__m512i _v, unsigned long long a, unsigned long 
long b, unsigned long long c, unsigned long long d, unsigned long long e, 
unsigned long long f, unsigned long long g, unsigned long long h) {
+  __v8du v = (__v8du)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h;
+}
+
+constexpr bool match_v8di(__m512i _v, long long a, long long b, long long c, 
long long d, long long e, long long f, long long g, long long h) {
+  __v8di v = (__v8di)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h;
+}
+
+constexpr bool match_v16si(__m512i _v, int a, int b, int c, int d, int e, int 
f, int g, int h, int i, int j, int k, int l, int m, int n, int o, int p) {
+  __v16si v = (__v16si)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] 
== l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
+}
+
+constexpr bool match_v16su(__m512i _v, unsigned int a, unsigned int b, 
unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, 
unsigned int h, unsigned int i, unsigned int j, unsigned int k, unsigned int l, 
unsigned int m, unsigned int n, unsigned int o, unsigned int p) {
+  __v16su v = (__v16su)_v;
+  return v[0] == a && v[1] == b && v[2] == c && v[3] == d && v[4] == e && v[5] 
== f && v[6] == g && v[7] == h && v[8] == i && v[9] == j && v[10] == k && v[11] 
== l && v[12] == m && v[13] == n && v[14] == o && v[15] == p;
+}
+
+constexpr bool match_v32hi(__m512i _v, short __e00, short __e01, short __e02, 
short __e03, short __e04, short __e05, short __e06, short __e07,
+                                       short __e08, short __e09, short __e10, 
short __e11, short __e12, short __e13, short __e14, short __e15,
+                                       short __e16, short __e17, short __e18, 
short __e19, short __e20, short __e21, short __e22, short __e23,
+                                       short __e24, short __e25, short __e26, 
short __e27, short __e28, short __e29, short __e30, short __e31) {
+  __v32hi v = (__v32hi)_v;
+  return v[ 0] == __e00 && v[ 1] == __e01 && v[ 2] == __e02 && v[ 3] == __e03 
&& v[ 4] == __e04 && v[ 5] == __e05 && v[ 6] == __e06 && v[ 7] ==  __e07 &&
+         v[ 8] == __e08 && v[ 9] == __e09 && v[10] == __e10 && v[11] == __e11 
&& v[12] == __e12 && v[13] == __e13 && v[14] == __e14 && v[15] ==  __e15 &&
+         v[16] == __e16 && v[17] == __e17 && v[18] == __e18 && v[19] == __e19 
&& v[20] == __e20 && v[21] == __e21 && v[22] == __e22 && v[23] ==  __e23 &&
+         v[24] == __e24 && v[25] == __e25 && v[26] == __e26 && v[27] == __e27 
&& v[28] == __e28 && v[29] == __e29 && v[30] == __e30 && v[31] ==  __e31;
+}
+
+constexpr bool match_v32hu(__m512i _v, unsigned short __e00, unsigned short 
__e01, unsigned short __e02, unsigned short __e03, unsigned short __e04, 
unsigned short __e05, unsigned short __e06, unsigned short __e07,
+                                       unsigned short __e08, unsigned short 
__e09, unsigned short __e10, unsigned short __e11, unsigned short __e12, 
unsigned short __e13, unsigned short __e14, unsigned short __e15,
+                                       unsigned short __e16, unsigned short 
__e17, unsigned short __e18, unsigned short __e19, unsigned short __e20, 
unsigned short __e21, unsigned short __e22, unsigned short __e23,
+                                       unsigned short __e24, unsigned short 
__e25, unsigned short __e26, unsigned short __e27, unsigned short __e28, 
unsigned short __e29, unsigned short __e30, unsigned short __e31) {
+  __v32hu v = (__v32hu)_v;
+  return v[ 0] == __e00 && v[ 1] == __e01 && v[ 2] == __e02 && v[ 3] == __e03 
&& v[ 4] == __e04 && v[ 5] == __e05 && v[ 6] == __e06 && v[ 7] ==  __e07 &&
+         v[ 8] == __e08 && v[ 9] == __e09 && v[10] == __e10 && v[11] == __e11 
&& v[12] == __e12 && v[13] == __e13 && v[14] == __e14 && v[15] ==  __e15 &&
+         v[16] == __e16 && v[17] == __e17 && v[18] == __e18 && v[19] == __e19 
&& v[20] == __e20 && v[21] == __e21 && v[22] == __e22 && v[23] ==  __e23 &&
+         v[24] == __e24 && v[25] == __e25 && v[26] == __e26 && v[27] == __e27 
&& v[28] == __e28 && v[29] == __e29 && v[30] == __e30 && v[31] ==  __e31;
+}
+
+constexpr bool match_v64qi(__m512i _v, signed char __e00, signed char __e01, 
signed char __e02, signed char __e03, signed char __e04, signed char __e05, 
signed char __e06, signed char __e07,
+                                       signed char __e08, signed char __e09, 
signed char __e10, signed char __e11, signed char __e12, signed char __e13, 
signed char __e14, signed char __e15,
+                                       signed char __e16, signed char __e17, 
signed char __e18, signed char __e19, signed char __e20, signed char __e21, 
signed char __e22, signed char __e23,
+                                       signed char __e24, signed char __e25, 
signed char __e26, signed char __e27, signed char __e28, signed char __e29, 
signed char __e30, signed char __e31,
+                                       signed char __e32, signed char __e33, 
signed char __e34, signed char __e35, signed char __e36, signed char __e37, 
signed char __e38, signed char __e39,
+                                       signed char __e40, signed char __e41, 
signed char __e42, signed char __e43, signed char __e44, signed char __e45, 
signed char __e46, signed char __e47,
+                                       signed char __e48, signed char __e49, 
signed char __e50, signed char __e51, signed char __e52, signed char __e53, 
signed char __e54, signed char __e55,
+                                       signed char __e56, signed char __e57, 
signed char __e58, signed char __e59, signed char __e60, signed char __e61, 
signed char __e62, signed char __e63) {
+  __v64qs v = (__v64qs)_v;
+  return v[ 0] == __e00 && v[ 1] == __e01 && v[ 2] == __e02 && v[ 3] == __e03 
&& v[ 4] == __e04 && v[ 5] == __e05 && v[ 6] == __e06 && v[ 7] == __e07 &&
+         v[ 8] == __e08 && v[ 9] == __e09 && v[10] == __e10 && v[11] == __e11 
&& v[12] == __e12 && v[13] == __e13 && v[14] == __e14 && v[15] == __e15 &&
+         v[16] == __e16 && v[17] == __e17 && v[18] == __e18 && v[19] == __e19 
&& v[20] == __e20 && v[21] == __e21 && v[22] == __e22 && v[23] == __e23 &&
+         v[24] == __e24 && v[25] == __e25 && v[26] == __e26 && v[27] == __e27 
&& v[28] == __e28 && v[29] == __e29 && v[30] == __e30 && v[31] == __e31 &&
+         v[32] == __e32 && v[33] == __e33 && v[34] == __e34 && v[35] == __e35 
&& v[36] == __e36 && v[37] == __e37 && v[38] == __e38 && v[39] == __e39 &&
+         v[40] == __e40 && v[41] == __e41 && v[42] == __e42 && v[43] == __e43 
&& v[44] == __e44 && v[45] == __e45 && v[46] == __e46 && v[47] == __e47 &&
+         v[48] == __e48 && v[49] == __e49 && v[50] == __e50 && v[51] == __e51 
&& v[52] == __e52 && v[53] == __e53 && v[54] == __e54 && v[55] == __e55 &&
+         v[56] == __e56 && v[57] == __e57 && v[58] == __e58 && v[59] == __e59 
&& v[60] == __e60 && v[61] == __e61 && v[62] == __e62 && v[63] == __e63;
+}
+
+constexpr bool match_v64qu(__m512i _v, unsigned char __e00, unsigned char 
__e01, unsigned char __e02, unsigned char __e03, unsigned char __e04, unsigned 
char __e05, unsigned char __e06, unsigned char __e07,
+                                       unsigned char __e08, unsigned char 
__e09, unsigned char __e10, unsigned char __e11, unsigned char __e12, unsigned 
char __e13, unsigned char __e14, unsigned char __e15,
+                                       unsigned char __e16, unsigned char 
__e17, unsigned char __e18, unsigned char __e19, unsigned char __e20, unsigned 
char __e21, unsigned char __e22, unsigned char __e23,
+                                       unsigned char __e24, unsigned char 
__e25, unsigned char __e26, unsigned char __e27, unsigned char __e28, unsigned 
char __e29, unsigned char __e30, unsigned char __e31,
+                                       unsigned char __e32, unsigned char 
__e33, unsigned char __e34, unsigned char __e35, unsigned char __e36, unsigned 
char __e37, unsigned char __e38, unsigned char __e39,
+                                       unsigned char __e40, unsigned char 
__e41, unsigned char __e42, unsigned char __e43, unsigned char __e44, unsigned 
char __e45, unsigned char __e46, unsigned char __e47,
+                                       unsigned char __e48, unsigned char 
__e49, unsigned char __e50, unsigned char __e51, unsigned char __e52, unsigned 
char __e53, unsigned char __e54, unsigned char __e55,
+                                       unsigned char __e56, unsigned char 
__e57, unsigned char __e58, unsigned char __e59, unsigned char __e60, unsigned 
char __e61, unsigned char __e62, unsigned char __e63) {
+  __v64qu v = (__v64qu)_v;
+  return v[ 0] == __e00 && v[ 1] == __e01 && v[ 2] == __e02 && v[ 3] == __e03 
&& v[ 4] == __e04 && v[ 5] == __e05 && v[ 6] == __e06 && v[ 7] == __e07 &&
+         v[ 8] == __e08 && v[ 9] == __e09 && v[10] == __e10 && v[11] == __e11 
&& v[12] == __e12 && v[13] == __e13 && v[14] == __e14 && v[15] == __e15 &&
+         v[16] == __e16 && v[17] == __e17 && v[18] == __e18 && v[19] == __e19 
&& v[20] == __e20 && v[21] == __e21 && v[22] == __e22 && v[23] == __e23 &&
+         v[24] == __e24 && v[25] == __e25 && v[26] == __e26 && v[27] == __e27 
&& v[28] == __e28 && v[29] == __e29 && v[30] == __e30 && v[31] == __e31 &&
+         v[32] == __e32 && v[33] == __e33 && v[34] == __e34 && v[35] == __e35 
&& v[36] == __e36 && v[37] == __e37 && v[38] == __e38 && v[39] == __e39 &&
+         v[40] == __e40 && v[41] == __e41 && v[42] == __e42 && v[43] == __e43 
&& v[44] == __e44 && v[45] == __e45 && v[46] == __e46 && v[47] == __e47 &&
+         v[48] == __e48 && v[49] == __e49 && v[50] == __e50 && v[51] == __e51 
&& v[52] == __e52 && v[53] == __e53 && v[54] == __e54 && v[55] == __e55 &&
+         v[56] == __e56 && v[57] == __e57 && v[58] == __e58 && v[59] == __e59 
&& v[60] == __e60 && v[61] == __e61 && v[62] == __e62 && v[63] == __e63;
+}
+
+#define TEST_CONSTEXPR(...) static_assert(__VA_ARGS__)
+
+#else
+
+#define TEST_CONSTEXPR(...)
+
+#endif
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c 
b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
new file mode 100644
index 0000000000000..378e8c71fa378
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
| FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | 
FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - 
-Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
| FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | 
FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - 
-Wall -Werror | FileCheck %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - 
-Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - 
-Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
+
+
+#include <x86intrin.h>
+#include "builtin_test_helpers.h"
+
+// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+__m128i test_mm_rot_epi8(__m128i a, __m128i b) {
+  // CHECK-LABEL: test_mm_rot_epi8
+  // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> 
%{{.*}}, <16 x i8> %{{.*}})
+  return _mm_rot_epi8(a, b);
+}
+TEST_CONSTEXPR(match_v16qi(_mm_rot_epi8((__m128i)(__v16qs){15, -14, -13, -12, 
11, 10, 9, 8, 7, 6, 5, -4, 3, -2, 1, 0}, (__m128i)(__v16qs){0, 1, -2, 3, -4, 5, 
-6, 7, -8, 9, -10, 11, -12, 13, -14, 15}), 15, -27, -4, -89, -80, 65, 36, 4, 7, 
12, 65, -25, 48, -33, 4, 0));
+
+__m128i test_mm_rot_epi16(__m128i a, __m128i b) {
+  // CHECK-LABEL: test_mm_rot_epi16
+  // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> 
%{{.*}}, <8 x i16> %{{.*}})
+  return _mm_rot_epi16(a, b);
+}
+TEST_CONSTEXPR(match_v8hi(_mm_rot_epi16((__m128i)(__v8hi){7, 6, 5, -4, 3, -2, 
1, 0}, (__m128i)(__v8hi){0, 1, -2, 3, -4, 5, -6, 7}), 7, 12, 16385, -25, 12288, 
-33, 1024, 0));
+
+__m128i test_mm_rot_epi32(__m128i a, __m128i b) {
+  // CHECK-LABEL: test_mm_rot_epi32
+  // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> 
%{{.*}}, <4 x i32> %{{.*}})
+  return _mm_rot_epi32(a, b);
+}
+TEST_CONSTEXPR(match_v4si(_mm_rot_epi32((__m128i)(__v4si){3, -2, 1, 0}, 
(__m128i)(__v4si){0, 1, -2, 3}), 3, -3, 1073741824, 0));
+
+__m128i test_mm_rot_epi64(__m128i a, __m128i b) {
+  // CHECK-LABEL: test_mm_rot_epi64
+  // CHECK: call {{.*}}<2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> 
%{{.*}}, <2 x i64> %{{.*}})
+  return _mm_rot_epi64(a, b);
+}
+TEST_CONSTEXPR(match_v2di(_mm_rot_epi64((__m128i)(__v2di){99, -55}, 
(__m128i)(__v2di){1, -2}), 198, 9223372036854775794LL));
+
+__m128i test_mm_roti_epi8(__m128i a) {
+  // CHECK-LABEL: test_mm_roti_epi8
+  // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> 
%{{.*}}, <16 x i8> splat (i8 1))
+  return _mm_roti_epi8(a, 1);
+}
+TEST_CONSTEXPR(match_v16qi(_mm_roti_epi8(((__m128i)(__v16qs){0, 1, -2, 3, -4, 
5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15}), 3), 0, 8, -9, 24, -25, 40, -41, 
56, -57, 72, -73, 88, -89, 104, -105, 120));
+
+__m128i test_mm_roti_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm_roti_epi16
+  // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> 
%{{.*}}, <8 x i16> splat (i16 50))
+  return _mm_roti_epi16(a, 50);
+}
+TEST_CONSTEXPR(match_v8hi(_mm_roti_epi16(((__m128i)(__v8hi){2, -3, 4, -5, 6, 
-7, 8, -9}), 1), 4, -5, 8, -9, 12, -13, 16, -17));
+
+__m128i test_mm_roti_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm_roti_epi32
+  // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> 
%{{.*}}, <4 x i32> splat (i32 226))
+  return _mm_roti_epi32(a, -30);
+}
+TEST_CONSTEXPR(match_v4si(_mm_roti_epi32(((__m128i)(__v4si){1, -2, 3, -4}), 
5), 32, -33, 96, -97));
+
+__m128i test_mm_roti_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm_roti_epi64
+  // CHECK: call {{.*}}<2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> 
%{{.*}}, <2 x i64> splat (i64 100))
+  return _mm_roti_epi64(a, 100);
+}
+TEST_CONSTEXPR(match_v2di(_mm_roti_epi64(((__m128i)(__v2di){99, -55}), 19), 
51904512, -28311553));
+
+

>From 2f932f03b582c99049005d7b4d200460846702cc Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Sun, 23 Nov 2025 15:47:29 +0100
Subject: [PATCH 03/10] remove local file, fix formatting issues

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index e844c41a80577..0797490aa6913 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -126,15 +126,14 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   // evaluation.
   assert(!cir::MissingFeatures::msvcBuiltins());
 
-  // Find out if any arguments are required to be integer constant
-  // expressions.
+  // Find out if any arguments are required to be integer constant expressions.
   assert(!cir::MissingFeatures::handleBuiltinICEArguments());
 
   // The operands of the builtin call
   llvm::SmallVector<mlir::Value> ops;
 
-  // `ICEArguments` is a bitmap indicating whether the argument at the i-th
-  // bit is required to be a constant integer expression.
+  // `ICEArguments` is a bitmap indicating whether the argument at the i-th bit
+  // is required to be a constant integer expression.
   unsigned iceArguments = 0;
   ASTContext::GetBuiltinTypeError error;
   getContext().GetBuiltinType(builtinID, error, &iceArguments);

>From bb1f8fa5ca28dccc9cbeed9a704031938eae74fd Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Sun, 23 Nov 2025 15:55:17 +0100
Subject: [PATCH 04/10] remove local file

---
 shell.nix | 14 --------------
 1 file changed, 14 deletions(-)
 delete mode 100644 shell.nix

diff --git a/shell.nix b/shell.nix
deleted file mode 100644
index c30f6dc7b6928..0000000000000
--- a/shell.nix
+++ /dev/null
@@ -1,14 +0,0 @@
-let
-  nixpkgs = fetchTarball 
"https://github.com/NixOS/nixpkgs/tarball/nixos-24.05";
-  pkgs = import nixpkgs { config = {}; overlays = []; };
-in
-
-
-pkgs.mkShellNoCC {
-  packages = with pkgs; [
-    cmake
-    ninja
-    llvmPackages_latest.llvm
-  ];
-stdenv = pkgs.clangStdenv;
-}

>From 87ec5f4ffa02512ce9cc70fd1acf9cb03aa917d7 Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Mon, 24 Nov 2025 21:03:14 +0100
Subject: [PATCH 05/10] backup

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |   4 +-
 .../CIR/CodeGenBuiltins/X86/xop-builtin.c     | 108 ++++++------------
 2 files changed, 36 insertions(+), 76 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 0797490aa6913..e478eee4253e5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -683,7 +683,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_prolq128:
   case X86::BI__builtin_ia32_prolq256:
   case X86::BI__builtin_ia32_prolq512:
-    return emitX86FunnelShift(*this, getLoc(expr->getExprLoc()), ops[0], 
ops[1],
+    return emitX86FunnelShift(*this, getLoc(expr->getExprLoc()), ops[0], 
ops[0],
                               ops[1], false);
   case X86::BI__builtin_ia32_prord128:
   case X86::BI__builtin_ia32_prord256:
@@ -691,7 +691,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_prorq128:
   case X86::BI__builtin_ia32_prorq256:
   case X86::BI__builtin_ia32_prorq512:
-    return emitX86FunnelShift(*this, getLoc(expr->getExprLoc()), ops[0], 
ops[1],
+    return emitX86FunnelShift(*this, getLoc(expr->getExprLoc()), ops[0], 
ops[0],
                               ops[1], true);
   case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c 
b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
index 378e8c71fa378..a90ccd8b60461 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
@@ -1,82 +1,42 @@
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
| FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | 
FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - 
-Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
| FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror | 
FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - 
-Wall -Werror | FileCheck %s
-
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - 
-Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - 
-Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
-
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o 
%t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir 
-emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o 
%t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir 
-emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
| FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
| FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror | FileCheck %s -check-prefix=OGCG
 
 #include <x86intrin.h>
-#include "builtin_test_helpers.h"
 
 // This test mimics clang/test/CodeGen/X86/xop-builtins.c, which eventually
 // CIR shall be able to support fully.
 
-__m128i test_mm_rot_epi8(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_rot_epi8
-  // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> 
%{{.*}}, <16 x i8> %{{.*}})
-  return _mm_rot_epi8(a, b);
-}
-TEST_CONSTEXPR(match_v16qi(_mm_rot_epi8((__m128i)(__v16qs){15, -14, -13, -12, 
11, 10, 9, 8, 7, 6, 5, -4, 3, -2, 1, 0}, (__m128i)(__v16qs){0, 1, -2, 3, -4, 5, 
-6, 7, -8, 9, -10, 11, -12, 13, -14, 15}), 15, -27, -4, -89, -80, 65, 36, 4, 7, 
12, 65, -25, 48, -33, 4, 0));
-
-__m128i test_mm_rot_epi16(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_rot_epi16
-  // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> 
%{{.*}}, <8 x i16> %{{.*}})
-  return _mm_rot_epi16(a, b);
-}
-TEST_CONSTEXPR(match_v8hi(_mm_rot_epi16((__m128i)(__v8hi){7, 6, 5, -4, 3, -2, 
1, 0}, (__m128i)(__v8hi){0, 1, -2, 3, -4, 5, -6, 7}), 7, 12, 16385, -25, 12288, 
-33, 1024, 0));
-
-__m128i test_mm_rot_epi32(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_rot_epi32
-  // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> 
%{{.*}}, <4 x i32> %{{.*}})
-  return _mm_rot_epi32(a, b);
-}
-TEST_CONSTEXPR(match_v4si(_mm_rot_epi32((__m128i)(__v4si){3, -2, 1, 0}, 
(__m128i)(__v4si){0, 1, -2, 3}), 3, -3, 1073741824, 0));
-
-__m128i test_mm_rot_epi64(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_rot_epi64
-  // CHECK: call {{.*}}<2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> 
%{{.*}}, <2 x i64> %{{.*}})
-  return _mm_rot_epi64(a, b);
-}
-TEST_CONSTEXPR(match_v2di(_mm_rot_epi64((__m128i)(__v2di){99, -55}, 
(__m128i)(__v2di){1, -2}), 198, 9223372036854775794LL));
-
 __m128i test_mm_roti_epi8(__m128i a) {
-  // CHECK-LABEL: test_mm_roti_epi8
-  // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> 
%{{.*}}, <16 x i8> splat (i8 1))
+  // CIR-LABEL: test_mm_roti_epi8
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}8i, !cir.vector<16 x 
!{{[us]}}8i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<16 x 
!{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> 
!cir.vector<16 x !{{[su]}}8i> 
+  // LLVM-LABEL: test_mm_roti_epi8
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
+  // LLVM: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> 
%[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+  // OGCG-LABEL: test_mm_roti_epi8
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
+  // OGCG: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> 
%[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
   return _mm_roti_epi8(a, 1);
-}
-TEST_CONSTEXPR(match_v16qi(_mm_roti_epi8(((__m128i)(__v16qs){0, 1, -2, 3, -4, 
5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15}), 3), 0, 8, -9, 24, -25, 40, -41, 
56, -57, 72, -73, 88, -89, 104, -105, 120));
-
-__m128i test_mm_roti_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm_roti_epi16
-  // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> 
%{{.*}}, <8 x i16> splat (i16 50))
-  return _mm_roti_epi16(a, 50);
-}
-TEST_CONSTEXPR(match_v8hi(_mm_roti_epi16(((__m128i)(__v8hi){2, -3, 4, -5, 6, 
-7, 8, -9}), 1), 4, -5, 8, -9, 12, -13, 16, -17));
-
-__m128i test_mm_roti_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm_roti_epi32
-  // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> 
%{{.*}}, <4 x i32> splat (i32 226))
-  return _mm_roti_epi32(a, -30);
-}
-TEST_CONSTEXPR(match_v4si(_mm_roti_epi32(((__m128i)(__v4si){1, -2, 3, -4}), 
5), 32, -33, 96, -97));
-
-__m128i test_mm_roti_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm_roti_epi64
-  // CHECK: call {{.*}}<2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> 
%{{.*}}, <2 x i64> splat (i64 100))
-  return _mm_roti_epi64(a, 100);
-}
-TEST_CONSTEXPR(match_v2di(_mm_roti_epi64(((__m128i)(__v2di){99, -55}), 19), 
51904512, -28311553));
-
-
+ }

>From b12cd83beb22f4af229adda279b8c377dc10f463 Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Tue, 25 Nov 2025 21:19:58 +0100
Subject: [PATCH 06/10] add cir tests, cast signed amts to unsigned

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 15 +++-
 .../CIR/CodeGenBuiltins/X86/xop-builtin.c     | 37 +++++++++
 .../CIR/CodeGenBuiltins/X86/xop-builtins.c    | 79 +++++++++++++++++++
 3 files changed, 128 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index e478eee4253e5..5b80aea16e771 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -17,6 +17,7 @@
 #include "mlir/IR/ValueRange.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 
 using namespace clang;
@@ -97,9 +98,17 @@ static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf,
   // Funnel shifts amounts are treated as modulo and types are all power-of-2
   // so we only care about the lowest log2 bits anyway.
   if (amt.getType() != ty) {
-    amt = cgf.getBuilder().createIntCast(
-        amt, mlir::cast<cir::VectorType>(ty).getElementType());
-    amt = cir::VecSplatOp::create(cgf.getBuilder(), location, ty, amt);
+    auto vecTy = mlir::cast<cir::VectorType>(ty);
+
+    auto numElems = vecTy.getSize();
+    cir::IntType vecElemType = 
mlir::cast<cir::IntType>(vecTy.getElementType());
+    auto signlessType =
+        cir::IntType::get(&cgf.getMLIRContext(), vecElemType.getWidth(), 
false);
+    amt = cgf.getBuilder().createIntCast(amt, signlessType);
+
+    amt = cir::VecSplatOp::create(cgf.getBuilder(), 
cgf.getLoc(e->getExprLoc()),
+                                  cir::VectorType::get(signlessType, numElems),
+                                  amt);
   }
 
   const std::string intrinsicName = isRight ? "fshr" : "fshl";
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c 
b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
index a90ccd8b60461..c8ae5eb0fd82d 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
@@ -39,4 +39,41 @@ __m128i test_mm_roti_epi8(__m128i a) {
   // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
   // OGCG: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> 
%[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
   return _mm_roti_epi8(a, 1);
+}
+
+__m128i test_mm_roti_epi16(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi16
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u16i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}16i, !cir.vector<8 x 
!{{[us]}}16i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x 
!{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>) -> 
!cir.vector<8 x !{{[su]}}16i> 
+  // LLVM-LABEL: test_mm_roti_epi16
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
+  // LLVM: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> 
%[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  // OGCG-LABEL: test_mm_roti_epi16
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
+  // OGCG: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> 
%[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  return _mm_roti_epi16(a, 50);
+ }
+
+// NOTE: This only works as expected for CIR, not for LLVM IR.
+__m128i test_mm_roti_epi32(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi32
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u32i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}32i, !cir.vector<4 x 
!{{[us]}}32i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x 
!{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>) -> 
!cir.vector<4 x !{{[su]}}32i> 
+  return _mm_roti_epi32(a, -30);
+ }
+
+__m128i test_mm_roti_epi64(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi64
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u64i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{.}}64i, !cir.vector<2 x 
!{{[us]}}64i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x 
!{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> 
!cir.vector<2 x !{{[su]}}64i> 
+  // LLVM-LABEL: test_mm_roti_epi64
+  // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
+  // LLVM: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x 
i64> %[[VAR]], <2 x i64> splat (i64 100))
+  // OGCG-LABEL: test_mm_roti_epi64
+  // OGCG: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
+  // OGCG: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x 
i64> %[[VAR]], <2 x i64> splat (i64 100))
+  return _mm_roti_epi64(a, 100);
  }
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c 
b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c
new file mode 100644
index 0000000000000..c8ae5eb0fd82d
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtins.c
@@ -0,0 +1,79 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o 
%t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir 
-emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-cir -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-cir -o 
%t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fclangir -emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -fclangir 
-emit-llvm -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
| FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Wall -Werror 
| FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o 
- -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+#include <x86intrin.h>
+
+// This test mimics clang/test/CodeGen/X86/xop-builtins.c, which eventually
+// CIR shall be able to support fully.
+
+__m128i test_mm_roti_epi8(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi8
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}8i, !cir.vector<16 x 
!{{[us]}}8i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<16 x 
!{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>, !cir.vector<16 x !{{[su]}}8i>) -> 
!cir.vector<16 x !{{[su]}}8i> 
+  // LLVM-LABEL: test_mm_roti_epi8
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
+  // LLVM: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> 
%[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+  // OGCG-LABEL: test_mm_roti_epi8
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <16 x i8>
+  // OGCG: {{%.*}} = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> 
%[[CASTED_VAR]], <16 x i8> %[[CASTED_VAR]], <16 x i8> splat (i8 1))
+  return _mm_roti_epi8(a, 1);
+}
+
+__m128i test_mm_roti_epi16(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi16
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u16i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}16i, !cir.vector<8 x 
!{{[us]}}16i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x 
!{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>) -> 
!cir.vector<8 x !{{[su]}}16i> 
+  // LLVM-LABEL: test_mm_roti_epi16
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
+  // LLVM: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> 
%[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  // OGCG-LABEL: test_mm_roti_epi16
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
+  // OGCG: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> 
%[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
+  return _mm_roti_epi16(a, 50);
+ }
+
+//NOTE: This only works as I expect for CIR but not for LLVMIR
+__m128i test_mm_roti_epi32(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi32
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u32i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}32i, !cir.vector<4 x 
!{{[us]}}32i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x 
!{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>) -> 
!cir.vector<4 x !{{[su]}}32i> 
+  return _mm_roti_epi32(a, -30);
+ }
+
+__m128i test_mm_roti_epi64(__m128i a) {
+  // CIR-LABEL: test_mm_roti_epi64
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u64i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{.}}64i, !cir.vector<2 x 
!{{[us]}}64i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x 
!{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> 
!cir.vector<2 x !{{[su]}}64i> 
+  // LLVM-LABEL: test_mm_roti_epi64
+  // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
+  // LLVM: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x 
i64> %[[VAR]], <2 x i64> splat (i64 100))
+  // OGCG-LABEL: test_mm_roti_epi64
+  // OGCG: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
+  // OGCG: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x 
i64> %[[VAR]], <2 x i64> splat (i64 100))
+  return _mm_roti_epi64(a, 100);
+ }

>From 8d5e1664c3b7acb14b63a899361bd18a47decfe6 Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Tue, 25 Nov 2025 21:24:05 +0100
Subject: [PATCH 07/10] rebase, use auto for variable holding an explicit cast

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 5b80aea16e771..6e6f1bba07ad6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -101,7 +101,7 @@ static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf,
     auto vecTy = mlir::cast<cir::VectorType>(ty);
 
     auto numElems = vecTy.getSize();
-    cir::IntType vecElemType = 
mlir::cast<cir::IntType>(vecTy.getElementType());
+    auto vecElemType = mlir::cast<cir::IntType>(vecTy.getElementType());
     auto signlessType =
         cir::IntType::get(&cgf.getMLIRContext(), vecElemType.getWidth(), 
false);
     amt = cgf.getBuilder().createIntCast(amt, signlessType);

>From 6e71a4d399c56abc33b7404797721791bf3b4861 Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Tue, 25 Nov 2025 23:46:44 +0100
Subject: [PATCH 08/10] cast to unsigned properly

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 32 ++++++++++++-------
 .../CIR/CodeGenBuiltins/X86/xop-builtin.c     | 24 ++++++++------
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 6e6f1bba07ad6..7ac389a2af715 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -92,23 +92,31 @@ static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf,
                                       const mlir::Location &location,
                                       mlir::Value &op0, mlir::Value &op1,
                                       mlir::Value &amt, bool isRight) {
-  auto ty = op0.getType();
+  auto &builder = cgf.getBuilder();
+  auto op0Ty = op0.getType();
 
   // Amount may be scalar immediate, in which case create a splat vector.
   // Funnel shifts amounts are treated as modulo and types are all power-of-2
   // so we only care about the lowest log2 bits anyway.
-  if (amt.getType() != ty) {
-    auto vecTy = mlir::cast<cir::VectorType>(ty);
-
+  if (amt.getType() != op0Ty) {
+    auto vecTy = mlir::cast<cir::VectorType>(op0Ty);
     auto numElems = vecTy.getSize();
-    auto vecElemType = mlir::cast<cir::IntType>(vecTy.getElementType());
-    auto signlessType =
-        cir::IntType::get(&cgf.getMLIRContext(), vecElemType.getWidth(), 
false);
-    amt = cgf.getBuilder().createIntCast(amt, signlessType);
-
-    amt = cir::VecSplatOp::create(cgf.getBuilder(), 
cgf.getLoc(e->getExprLoc()),
-                                  cir::VectorType::get(signlessType, numElems),
-                                  amt);
+
+    auto amtTy = mlir::cast<cir::IntType>(amt.getType());
+    auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType());
+
+    // Cast to same width unsigned if not already unsigned.
+    if (amtTy.isSigned()) {
+      auto unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
+      amt = builder.createIntCast(amt,
+                                  
builder.getUIntNTy(unsignedAmtTy.getWidth()));
+    }
+    // Cast the unsigned `amt` to operand element type's width unsigned.
+    auto unsingedVecElemType = builder.getUIntNTy(vecElemTy.getWidth());
+    amt = builder.createIntCast(amt, unsingedVecElemType);
+    amt = cir::VecSplatOp::create(
+        builder, cgf.getLoc(e->getExprLoc()),
+        cir::VectorType::get(unsingedVecElemType, numElems), amt);
   }
 
   const std::string intrinsicName = isRight ? "fshr" : "fshl";
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c 
b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
index c8ae5eb0fd82d..cf69331f10dca 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/xop-builtin.c
@@ -43,9 +43,9 @@ __m128i test_mm_roti_epi8(__m128i a) {
 
 __m128i test_mm_roti_epi16(__m128i a) {
   // CIR-LABEL: test_mm_roti_epi16
-  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u16i
-  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}16i, !cir.vector<8 x 
!{{[us]}}16i> 
-  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x 
!{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>) -> 
!cir.vector<8 x !{{[su]}}16i> 
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !u8i -> !u16i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}16i, !cir.vector<8 x 
!u16i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<8 x 
!{{[su]}}16i>, !cir.vector<8 x !{{[su]}}16i>, !cir.vector<8 x !u16i>) -> 
!cir.vector<8 x !{{[su]}}16i> 
   // LLVM-LABEL: test_mm_roti_epi16
   // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <8 x i16>
   // LLVM: {{%.*}} = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> 
%[[CASTED_VAR]], <8 x i16> %[[CASTED_VAR]], <8 x i16> splat (i16 50))
@@ -58,17 +58,23 @@ __m128i test_mm_roti_epi16(__m128i a) {
 //NOTE: This only works as I expect for CIR but not for LLVMIR
 __m128i test_mm_roti_epi32(__m128i a) {
   // CIR-LABEL: test_mm_roti_epi32
-  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u32i
-  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}32i, !cir.vector<4 x 
!{{[us]}}32i> 
-  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x 
!{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>) -> 
!cir.vector<4 x !{{[su]}}32i> 
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !u8i -> !u32i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{[us]}}32i, !cir.vector<4 x 
!u32i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<4 x 
!{{[su]}}32i>, !cir.vector<4 x !{{[su]}}32i>, !cir.vector<4 x !u32i>) -> 
!cir.vector<4 x !{{[su]}}32i> 
+  // LLVM-LABEL: test_mm_roti_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <4 x i32>
+  // LLVM: {{%.*}} = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> 
%[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
+  // OGCG-LABEL: test_mm_roti_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <2 x i64> {{%.*}} to <4 x i32>
+  // OGCG: {{%.*}} = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> 
%[[CASTED_VAR]], <4 x i32> %[[CASTED_VAR]], <4 x i32> splat (i32 226))
   return _mm_roti_epi32(a, -30);
  }
 
 __m128i test_mm_roti_epi64(__m128i a) {
   // CIR-LABEL: test_mm_roti_epi64
-  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !{{[us]}}8i -> !u64i
-  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !{{.}}64i, !cir.vector<2 x 
!{{[us]}}64i> 
-  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x 
!{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> 
!cir.vector<2 x !{{[su]}}64i> 
+  // CIR: {{%.*}} = cir.cast integral {{%.*}} : !u8i -> !u64i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !u64i, !cir.vector<2 x !u64i> 
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshl" {{.*}} : (!cir.vector<2 x 
!{{[su]}}64i>, !cir.vector<2 x !{{[su]}}64i>, !cir.vector<2 x !u64i>) -> 
!cir.vector<2 x !s64i> 
   // LLVM-LABEL: test_mm_roti_epi64
   // LLVM: %[[VAR:.*]] = load <2 x i64>, ptr {{%.*}}, align 16
   // LLVM: {{%.*}} = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %[[VAR]], <2 x 
i64> %[[VAR]], <2 x i64> splat (i64 100))

>From f25c41e56c8290ad064954407ab161f6a44024ea Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Thu, 27 Nov 2025 08:57:44 +0100
Subject: [PATCH 09/10] address comments

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 21 +++++++++----------
 .../CodeGenBuiltins/X86/avx512f-builtins.c    | 12 +++++++++++
 shell.nix                                     | 14 +++++++++++++
 3 files changed, 36 insertions(+), 11 deletions(-)
 create mode 100644 shell.nix

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 7ac389a2af715..0cb42ef50ec8c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -92,35 +92,34 @@ static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf,
                                       const mlir::Location &location,
                                       mlir::Value &op0, mlir::Value &op1,
                                       mlir::Value &amt, bool isRight) {
-  auto &builder = cgf.getBuilder();
-  auto op0Ty = op0.getType();
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+  mlir::Type op0Ty = op0.getType();
 
   // Amount may be scalar immediate, in which case create a splat vector.
   // Funnel shifts amounts are treated as modulo and types are all power-of-2
   // so we only care about the lowest log2 bits anyway.
   if (amt.getType() != op0Ty) {
     auto vecTy = mlir::cast<cir::VectorType>(op0Ty);
-    auto numElems = vecTy.getSize();
+    uint64_t numElems = vecTy.getSize();
 
     auto amtTy = mlir::cast<cir::IntType>(amt.getType());
     auto vecElemTy = mlir::cast<cir::IntType>(vecTy.getElementType());
 
     // Cast to same width unsigned if not already unsigned.
     if (amtTy.isSigned()) {
-      auto unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
-      amt = builder.createIntCast(amt,
-                                  
builder.getUIntNTy(unsignedAmtTy.getWidth()));
+      cir::IntType unsignedAmtTy = builder.getUIntNTy(amtTy.getWidth());
+      amt = builder.createIntCast(amt, unsignedAmtTy);
     }
     // Cast the unsigned `amt` to operand element type's width unsigned.
-    auto unsingedVecElemType = builder.getUIntNTy(vecElemTy.getWidth());
-    amt = builder.createIntCast(amt, unsingedVecElemType);
+    cir::IntType unsignedVecElemType = 
builder.getUIntNTy(vecElemTy.getWidth());
+    amt = builder.createIntCast(amt, unsignedVecElemType);
     amt = cir::VecSplatOp::create(
-        builder, cgf.getLoc(e->getExprLoc()),
-        cir::VectorType::get(unsingedVecElemType, numElems), amt);
+        builder, location, cir::VectorType::get(unsignedVecElemType, numElems),
+        amt);
   }
 
   const std::string intrinsicName = isRight ? "fshr" : "fshl";
-  return emitIntrinsicCallOp(cgf.getBuilder(), location, intrinsicName, ty,
+  return emitIntrinsicCallOp(cgf.getBuilder(), location, intrinsicName, op0Ty,
                              mlir::ValueRange{op0, op1, amt});
 }
 
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c 
b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
index dc54a87856a7c..65db2ac04fad0 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
@@ -77,3 +77,15 @@ __m512i test_mm512_undefined_epi32(void) {
   // OGCG: ret <8 x i64> zeroinitializer
   return _mm512_undefined_epi32();
 }
+
+__m512i test_mm512_ror_epi32(__m512i __A) {
+  // CHECK-LABEL: test_mm512_ror_epi32
+  // CHECK: @llvm.fshr.v16i32
+  return _mm512_ror_epi32(__A, 5); 
+}
+
+__m512i test_mm512_ror_epi64(__m512i __A) {
+  // CHECK-LABEL: test_mm512_ror_epi64
+  // CHECK: @llvm.fshr.v8i64
+  return _mm512_ror_epi64(__A, 5); 
+}
diff --git a/shell.nix b/shell.nix
new file mode 100644
index 0000000000000..c30f6dc7b6928
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,14 @@
+let
+  nixpkgs = fetchTarball 
"https://github.com/NixOS/nixpkgs/tarball/nixos-24.05";
+  pkgs = import nixpkgs { config = {}; overlays = []; };
+in
+
+
+pkgs.mkShellNoCC {
+  packages = with pkgs; [
+    cmake
+    ninja
+    llvmPackages_latest.llvm
+  ];
+stdenv = pkgs.clangStdenv;
+}

>From ac53a369b72fcabece3d6c4037ddc8b6758bb66c Mon Sep 17 00:00:00 2001
From: Omar Ibrahim <[email protected]>
Date: Thu, 27 Nov 2025 22:51:26 +0100
Subject: [PATCH 10/10] address comments

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  6 ++---
 .../CodeGenBuiltins/X86/avx512f-builtins.c    | 25 ++++++++++++++++---
 clang/test/CodeGen/X86/xop-builtins.c         |  7 ++++++
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 0cb42ef50ec8c..20891bd214400 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -89,9 +89,9 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, 
mlir::Location loc,
 }
 
 static mlir::Value emitX86FunnelShift(CIRGenFunction &cgf,
-                                      const mlir::Location &location,
-                                      mlir::Value &op0, mlir::Value &op1,
-                                      mlir::Value &amt, bool isRight) {
+                                      mlir::Location location, mlir::Value 
&op0,
+                                      mlir::Value &op1, mlir::Value &amt,
+                                      bool isRight) {
   CIRGenBuilderTy &builder = cgf.getBuilder();
   mlir::Type op0Ty = op0.getType();
 
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c 
b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
index 65db2ac04fad0..039d3f9f144de 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
@@ -79,13 +79,30 @@ __m512i test_mm512_undefined_epi32(void) {
 }
 
 __m512i test_mm512_ror_epi32(__m512i __A) {
-  // CHECK-LABEL: test_mm512_ror_epi32
-  // CHECK: @llvm.fshr.v16i32
+  // CIR-LABEL: test_mm512_ror_epi32
+  // CIR: {{%.*}} =  cir.cast integral {{%.*}} : !s32i -> !u32i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !u32i, !cir.vector<16 x !u32i>
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<16 x 
!s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x 
!s32i> 
+  // LLVM-LABEL: test_mm512_ror_epi32
+  // LLVM: %[[CASTED_VAR:.*]] = bitcast <8 x i64> {{%.*}} to <16 x i32>
+  // LLVM: {{%.*}} = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> 
%[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
+  // OGCG-LABEL: test_mm512_ror_epi32
+  // OGCG: %[[CASTED_VAR:.*]] = bitcast <8 x i64> {{%.*}} to <16 x i32>
+  // OGCG: {{%.*}} = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> 
%[[CASTED_VAR]], <16 x i32> %[[CASTED_VAR]], <16 x i32> splat (i32 5))
   return _mm512_ror_epi32(__A, 5); 
 }
 
 __m512i test_mm512_ror_epi64(__m512i __A) {
-  // CHECK-LABEL: test_mm512_ror_epi64
-  // CHECK: @llvm.fshr.v8i64
+  // CIR-LABEL: test_mm512_ror_epi64
+  // CIR: {{%.*}} =  cir.cast integral {{%.*}} : !s32i -> !u32i
+  // CIR: {{%.*}} =  cir.cast integral {{%.*}} : !u32i -> !u64i
+  // CIR: {{%.*}} = cir.vec.splat {{%.*}} : !u64i, !cir.vector<8 x !u64i>
+  // CIR: {{%.*}} = cir.call_llvm_intrinsic "fshr" {{%.*}}: (!cir.vector<8 x 
!s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x 
!s64i> 
+  // LLVM-LABEL: test_mm512_ror_epi64
+  // LLVM: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
+  // LLVM: {{%.*}} = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x 
i64> %[[VAR]], <8 x i64> splat (i64 5))
+  // OGCG-LABEL: test_mm512_ror_epi64
+  // OGCG: %[[VAR:.*]] = load <8 x i64>, ptr {{%.*}}, align 64
+  // OGCG: {{%.*}} = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %[[VAR]], <8 x 
i64> %[[VAR]], <8 x i64> splat (i64 5))
   return _mm512_ror_epi64(__A, 5); 
 }
diff --git a/clang/test/CodeGen/X86/xop-builtins.c 
b/clang/test/CodeGen/X86/xop-builtins.c
index a3cff2c89da1f..40e49e1fd1bb0 100644
--- a/clang/test/CodeGen/X86/xop-builtins.c
+++ b/clang/test/CodeGen/X86/xop-builtins.c
@@ -429,3 +429,10 @@ __m256d test_mm256_frcz_pd(__m256d a) {
   // CHECK: call {{.*}}<4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> 
%{{.*}})
   return _mm256_frcz_pd(a);
 }
+
+
+__m512i test_mm512_ror_epi32(__m512i __A) {
+  // CHECK-LABEL: test_mm512_ror_epi32
+  // CHECK: @llvm.fshr.v16i32
+  return _mm512_ror_epi32(__A, 5); 
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [CIR] Support x86 builtin rotate (PR #169566)

Reply via email to