https://github.com/GeneraluseAI updated https://github.com/llvm/llvm-project/pull/169185

>From dc81e321c7062d0b3019706a8692e78cf2842e0e Mon Sep 17 00:00:00 2001
From: generaluseai <[email protected]>
Date: Sun, 23 Nov 2025 03:12:00 +0800
Subject: [PATCH] [CIR][X86] Implement lowering for AVX512 mask builtins (kadd,
 kand, kandn, kor, kxor, kxnor, knot, kmov)

This patch adds CIR codegen support for the X86 AVX512 mask builtins
kadd, kand, kandn, kor, kxor, kxnor, knot, and kmov for all supported
mask widths. The kadd builtins lower to calls to the corresponding
llvm.x86.avx512.kadd.* intrinsics; the logical and knot builtins lower
to bitcasts to vector<i1> plus plain vector operations; and kmov lowers
to a bitcast round trip through vector<i1>. This matches the IR
produced by classic codegen.

The patch also adds comprehensive CIR/LLVM/OGCG tests for AVX512F,
AVX512DQ, and AVX512BW to validate the lowering.
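
As a sketch of the shape of the lowering (drawn from the new tests; the
value names below are placeholders, not what CIR actually prints),
_kand_mask32, which expands to __builtin_ia32_kandsi, becomes:

  __mmask32 kand_example(__mmask32 a, __mmask32 b) {
    // CIR (roughly):
    //   %va  = cir.cast bitcast %a : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
    //   %vb  = cir.cast bitcast %b : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
    //   %r   = cir.binop(and, %va, %vb) : !cir.vector<32 x !cir.int<s, 1>>
    //   %out = cir.cast bitcast %r : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
    // LLVM lowering: bitcast i32 to <32 x i1>, and <32 x i1>, bitcast back.
    return _kand_mask32(a, b);
  }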
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  93 +++++-
 .../test/CIR/CodeGen/X86/avx512bw-builtins.c  | 308 ++++++++++++++++++
 .../test/CIR/CodeGen/X86/avx512dq-builtins.c  | 213 ++++++++++++
 clang/test/CIR/CodeGen/X86/avx512f-builtins.c | 284 +++++++++++-----
 4 files changed, 816 insertions(+), 82 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
 create mode 100644 clang/test/CIR/CodeGen/X86/avx512dq-builtins.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 978fee7dbec9d..0af8c75aa3419 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -68,6 +68,45 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
+// Convert the mask from an integer type to a vector of i1.
+static mlir::Value getMaskVecValue(CIRGenFunction &cgf, const CallExpr *expr,
+                                   mlir::Value mask, unsigned numElems) {
+  auto &builder = cgf.getBuilder();
+
+  cir::VectorType maskTy =
+      cir::VectorType::get(builder.getSIntNTy(1),
+                           cast<cir::IntType>(mask.getType()).getWidth());
+  mlir::Value maskVec = builder.createBitcast(mask, maskTy);
+
+  // If we have fewer than 8 elements, the starting mask was an i8 and we
+  // need to extract down to the right number of elements.
+  if (numElems < 8) {
+    SmallVector<mlir::Attribute, 4> indices;
+    mlir::Type i32Ty = builder.getI32Type();
+    for (auto i : llvm::seq<unsigned>(0, numElems))
+      indices.push_back(cir::IntAttr::get(i32Ty, i));
+    maskVec = builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), maskVec,
+                                       maskVec, indices);
+  }
+  return maskVec;
+}
+
+static mlir::Value emitX86MaskLogic(CIRGenFunction &cgf, const CallExpr *expr,
+                                    cir::BinOpKind opc,
+                                    SmallVectorImpl<mlir::Value> &ops,
+                                    bool invertLHS = false) {
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+  unsigned numElts = cast<cir::IntType>(ops[0].getType()).getWidth();
+  mlir::Value lhs = getMaskVecValue(cgf, expr, ops[0], numElts);
+  mlir::Value rhs = getMaskVecValue(cgf, expr, ops[1], numElts);
+
+  if (invertLHS)
+    lhs = builder.createNot(lhs);
+  return builder.createBitcast(
+      builder.createBinop(cgf.getLoc(expr->getExprLoc()), lhs, opc, rhs),
+      ops[0].getType());
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -675,38 +714,86 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_ktestzsi:
   case X86::BI__builtin_ia32_ktestcdi:
   case X86::BI__builtin_ia32_ktestzdi:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_kaddqi:
   case X86::BI__builtin_ia32_kaddhi:
   case X86::BI__builtin_ia32_kaddsi:
-  case X86::BI__builtin_ia32_kadddi:
+  case X86::BI__builtin_ia32_kadddi: {
+    std::string intrinsicName;
+    switch (builtinID) {
+    default:
+      llvm_unreachable("Unsupported intrinsic!");
+    case X86::BI__builtin_ia32_kaddqi:
+      intrinsicName = "x86.avx512.kadd.b";
+      break;
+    case X86::BI__builtin_ia32_kaddhi:
+      intrinsicName = "x86.avx512.kadd.w";
+      break;
+    case X86::BI__builtin_ia32_kaddsi:
+      intrinsicName = "x86.avx512.kadd.d";
+      break;
+    case X86::BI__builtin_ia32_kadddi:
+      intrinsicName = "x86.avx512.kadd.q";
+      break;
+    }
+    auto intTy = cast<cir::IntType>(ops[0].getType());
+    unsigned numElts = intTy.getWidth();
+    mlir::Value lhsVec = getMaskVecValue(*this, expr, ops[0], numElts);
+    mlir::Value rhsVec = getMaskVecValue(*this, expr, ops[1], numElts);
+    mlir::Type vecTy = lhsVec.getType();
+    mlir::Value resVec = emitIntrinsicCallOp(*this, expr, intrinsicName, vecTy,
+                                             mlir::ValueRange{lhsVec, rhsVec});
+    return builder.createBitcast(resVec, ops[0].getType());
+  }
   case X86::BI__builtin_ia32_kandqi:
   case X86::BI__builtin_ia32_kandhi:
   case X86::BI__builtin_ia32_kandsi:
   case X86::BI__builtin_ia32_kanddi:
+    return emitX86MaskLogic(*this, expr, cir::BinOpKind::And, ops);
   case X86::BI__builtin_ia32_kandnqi:
   case X86::BI__builtin_ia32_kandnhi:
   case X86::BI__builtin_ia32_kandnsi:
   case X86::BI__builtin_ia32_kandndi:
+    return emitX86MaskLogic(*this, expr, cir::BinOpKind::And, ops, true);
   case X86::BI__builtin_ia32_korqi:
   case X86::BI__builtin_ia32_korhi:
   case X86::BI__builtin_ia32_korsi:
   case X86::BI__builtin_ia32_kordi:
+    return emitX86MaskLogic(*this, expr, cir::BinOpKind::Or, ops);
   case X86::BI__builtin_ia32_kxnorqi:
   case X86::BI__builtin_ia32_kxnorhi:
   case X86::BI__builtin_ia32_kxnorsi:
   case X86::BI__builtin_ia32_kxnordi:
+    return emitX86MaskLogic(*this, expr, cir::BinOpKind::Xor, ops, true);
   case X86::BI__builtin_ia32_kxorqi:
   case X86::BI__builtin_ia32_kxorhi:
   case X86::BI__builtin_ia32_kxorsi:
   case X86::BI__builtin_ia32_kxordi:
+    return emitX86MaskLogic(*this, expr, cir::BinOpKind::Xor, ops);
   case X86::BI__builtin_ia32_knotqi:
   case X86::BI__builtin_ia32_knothi:
   case X86::BI__builtin_ia32_knotsi:
-  case X86::BI__builtin_ia32_knotdi:
+  case X86::BI__builtin_ia32_knotdi: {
+    cir::IntType intTy = cast<cir::IntType>(ops[0].getType());
+    unsigned numElts = intTy.getWidth();
+    mlir::Value resVec = getMaskVecValue(*this, expr, ops[0], numElts);
+    return builder.createBitcast(builder.createNot(resVec), ops[0].getType());
+  }
   case X86::BI__builtin_ia32_kmovb:
   case X86::BI__builtin_ia32_kmovw:
   case X86::BI__builtin_ia32_kmovd:
-  case X86::BI__builtin_ia32_kmovq:
+  case X86::BI__builtin_ia32_kmovq: {
+    // Bitcast to vXi1 type and then back to integer. This gets the mask
+    // register type into the IR, but might be optimized out depending on
+    // what's around it.
+    cir::IntType intTy = cast<cir::IntType>(ops[0].getType());
+    unsigned numElts = intTy.getWidth();
+    mlir::Value resVec = getMaskVecValue(*this, expr, ops[0], numElts);
+    return builder.createBitcast(resVec, ops[0].getType());
+  }
   case X86::BI__builtin_ia32_kunpckdi:
   case X86::BI__builtin_ia32_kunpcksi:
   case X86::BI__builtin_ia32_kunpckhi:
diff --git a/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c b/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
new file mode 100644
index 0000000000000..0d5aa2918e922
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
@@ -0,0 +1,308 @@
+ // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-cir -o %t.cir -Wall -Werror
+ // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+ // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-llvm -o %t.ll -Wall -Werror
+ // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
+ // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror
+ // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+ // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+ #include <immintrin.h>
+
+__mmask32 test_kadd_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kadd_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.call_llvm_intrinsic "x86.avx512.kadd.d"
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
+
+  // LLVM-LABEL: _kadd_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[RES:%.*]] = call <32 x i1> @llvm.x86.avx512.kadd.d(<32 x i1> [[L]], <32 x i1> [[R]])
+  // LLVM: bitcast <32 x i1> [[RES]] to i32
+
+  // OGCG-LABEL: _kadd_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: call <32 x i1> @llvm.x86.avx512.kadd.d
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kadd_mask32(A, B);
+}
+
+__mmask64 test_kadd_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kadd_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.call_llvm_intrinsic "x86.avx512.kadd.q"
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
+
+  // LLVM-LABEL: _kadd_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[RES:%.*]] = call <64 x i1> @llvm.x86.avx512.kadd.q(<64 x i1> [[L]], <64 x i1> [[R]])
+  // LLVM: bitcast <64 x i1> [[RES]] to i64
+
+  // OGCG-LABEL: _kadd_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: call <64 x i1> @llvm.x86.avx512.kadd.q
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kadd_mask64(A, B);
+}
+
+__mmask32 test_kand_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kand_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
+
+  // LLVM-LABEL: _kand_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[RES:%.*]] = and <32 x i1> [[L]], [[R]]
+  // LLVM: bitcast <32 x i1> [[RES]] to i32
+
+  // OGCG-LABEL: _kand_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: and <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kand_mask32(A, B);
+}
+
+__mmask64 test_kand_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kand_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
+
+  // LLVM-LABEL: _kand_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[RES:%.*]] = and <64 x i1> [[L]], [[R]]
+  // LLVM: bitcast <64 x i1> [[RES]] to i64
+
+  // OGCG-LABEL: _kand_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: and <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kand_mask64(A, B);
+}
+
+__mmask32 test_kandn_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kandn_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
+
+  // LLVM-LABEL: _kandn_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: xor <32 x i1> [[L]], {{.*}}
+  // LLVM: and <32 x i1>
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: _kandn_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: xor <32 x i1>
+  // OGCG: and <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kandn_mask32(A, B);
+}
+
+__mmask64 test_kandn_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kandn_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
+
+  // LLVM-LABEL: _kandn_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: xor <64 x i1> [[L]], {{.*}}
+  // LLVM: and <64 x i1>
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: _kandn_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: xor <64 x i1>
+  // OGCG: and <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kandn_mask64(A, B);
+}
+
+__mmask32 test_kor_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kor_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.binop(or, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
+
+  // LLVM-LABEL: _kor_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: or <32 x i1> [[L]], [[R]]
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: _kor_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: or <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kor_mask32(A, B);
+}
+
+__mmask64 test_kor_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kor_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.binop(or, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
+
+  // LLVM-LABEL: _kor_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: or <64 x i1> [[L]], [[R]]
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: _kor_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: or <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kor_mask64(A, B);
+}
+
+__mmask32 test_kxor_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kxor_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
+
+  // LLVM-LABEL: _kxor_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: xor <32 x i1> [[L]], [[R]]
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: _kxor_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: xor <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kxor_mask32(A, B);
+}
+
+__mmask64 test_kxor_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kxor_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
+
+  // LLVM-LABEL: _kxor_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: xor <64 x i1> [[L]], [[R]]
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: _kxor_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: xor <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kxor_mask64(A, B);
+}
+
+__mmask32 test_knot_mask32(__mmask32 A) {
+  // CIR-LABEL: _knot_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
+
+  // LLVM-LABEL: _knot_mask32
+  // LLVM: bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: xor <32 x i1>
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: _knot_mask32
+  // OGCG: xor <32 x i1>
+
+  return _knot_mask32(A);
+}
+
+__mmask64 test_knot_mask64(__mmask64 A) {
+  // CIR-LABEL: _knot_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
+
+  // LLVM-LABEL: _knot_mask64
+  // LLVM: bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: xor <64 x i1>
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: _knot_mask64
+  // OGCG: xor <64 x i1>
+
+  return _knot_mask64(A);
+}
+
+// Multiple user-level mask helpers inline to this same kmov builtin.
+// CIR does not implement any special lowering for those helpers.
+//
+// Therefore, testing the builtin (__builtin_ia32_kmov*) directly is
+// sufficient to cover the CIR lowering behavior. Testing each helper
+// individually would add no new CIR paths.
+
+__mmask32 test_kmov_d(__mmask32 A) {
+  // CIR-LABEL: test_kmov_d
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
+
+  // LLVM-LABEL: test_kmov_d
+  // LLVM: bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: test_kmov_d
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+
+  return __builtin_ia32_kmovd(A);
+}
+
+// Multiple user-level mask helpers inline to this same kmov builtin.
+// CIR does not implement any special lowering for those helpers.
+//
+// Therefore, testing the builtin (__builtin_ia32_kmov*) directly is
+// sufficient to cover the CIR lowering behavior. Testing each helper
+// individually would add no new CIR paths.
+
+__mmask64 test_kmov_q(__mmask64 A) {
+  // CIR-LABEL: test_kmov_q
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
+
+  // LLVM-LABEL: test_kmov_q
+  // LLVM: bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: test_kmov_q
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+
+  return __builtin_ia32_kmovq(A);
+}
diff --git a/clang/test/CIR/CodeGen/X86/avx512dq-builtins.c b/clang/test/CIR/CodeGen/X86/avx512dq-builtins.c
new file mode 100644
index 0000000000000..21c255e28f3f3
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx512dq-builtins.c
@@ -0,0 +1,213 @@
+ // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fclangir -emit-cir -o %t.cir -Wall -Werror
+ // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+ // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fclangir -emit-llvm -o %t.ll -Wall -Werror
+ // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
+ // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror
+ // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+ // RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+ #include <immintrin.h>
+
+__mmask8 test_kadd_mask8(__mmask8 A, __mmask8 B) {
+  // CIR-LABEL: _kadd_mask8
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.call_llvm_intrinsic "x86.avx512.kadd.b"
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
+
+  // LLVM-LABEL: _kadd_mask8
+  // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.kadd.b(<8 x i1> [[L]], <8 x i1> [[R]])
+  // LLVM: bitcast <8 x i1> [[RES]] to i8
+
+  // OGCG-LABEL: _kadd_mask8
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: call <8 x i1> @llvm.x86.avx512.kadd.b
+  // OGCG: bitcast <8 x i1> {{.*}} to i8
+  return _kadd_mask8(A, B);
+
+}
+
+__mmask16 test_kadd_mask16(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _kadd_mask16
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.call_llvm_intrinsic "x86.avx512.kadd.w"
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
+
+  // LLVM-LABEL: _kadd_mask16
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.kadd.w(<16 x i1> [[L]], <16 x i1> [[R]])
+  // LLVM: bitcast <16 x i1> [[RES]] to i16
+
+  // OGCG-LABEL: _kadd_mask16
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: call <16 x i1> @llvm.x86.avx512.kadd.w
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _kadd_mask16(A, B);
+}
+
+__mmask8 test_kand_mask8(__mmask8 A, __mmask8 B) {
+  // CIR-LABEL: _kand_mask8
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
+
+  // LLVM-LABEL: _kand_mask8
+  // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: [[RES:%.*]] = and <8 x i1> [[L]], [[R]]
+  // LLVM: bitcast <8 x i1> [[RES]] to i8
+
+  // OGCG-LABEL: _kand_mask8
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: and <8 x i1>
+  // OGCG: bitcast <8 x i1> {{.*}} to i8
+  return _kand_mask8(A, B);
+}
+
+
+__mmask8 test_kandn_mask8(__mmask8 A, __mmask8 B) {
+  // CIR-LABEL: _kandn_mask8
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
+
+  // LLVM-LABEL: _kandn_mask8
+  // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: xor <8 x i1> [[L]], {{.*}}
+  // LLVM: and <8 x i1>
+  // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+  // OGCG-LABEL: _kandn_mask8
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: xor <8 x i1>
+  // OGCG: and <8 x i1>
+  // OGCG: bitcast <8 x i1> {{.*}} to i8
+
+  return _kandn_mask8(A, B);
+}
+
+__mmask8 test_kor_mask8(__mmask8 A, __mmask8 B) {
+  // CIR-LABEL: _kor_mask8
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.binop(or, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
+
+  // LLVM-LABEL: _kor_mask8
+  // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: or <8 x i1> [[L]], [[R]]
+  // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+  // OGCG-LABEL: _kor_mask8
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: or <8 x i1>
+  // OGCG: bitcast <8 x i1> {{.*}} to i8
+
+  return _kor_mask8(A, B);
+}
+
+__mmask8 test_kxor_mask8(__mmask8 A, __mmask8 B) {
+  // CIR-LABEL: _kxor_mask8
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
+
+  // LLVM-LABEL: _kxor_mask8
+  // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: xor <8 x i1> [[L]], [[R]]
+  // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+  // OGCG-LABEL: _kxor_mask8
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: xor <8 x i1>
+  // OGCG: bitcast <8 x i1> {{.*}} to i8
+
+  return _kxor_mask8(A, B);
+}
+
+__mmask8 test_kxnor_mask8(__mmask8 A, __mmask8 B) {
+  // CIR-LABEL: _kxnor_mask8
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
+
+  // LLVM-LABEL: _kxnor_mask8
+  // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: xor <8 x i1> [[L]], {{.*}}
+  // LLVM: xor <8 x i1> {{.*}}, [[R]]
+
+  // OGCG-LABEL: _kxnor_mask8
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: xor <8 x i1>
+  // OGCG: xor <8 x i1>
+  // OGCG: bitcast <8 x i1> {{.*}} to i8
+
+  return _kxnor_mask8(A, B);
+}
+
+
+__mmask8 test_knot_mask8(__mmask8 A) {
+  // CIR-LABEL: _knot_mask8
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
+
+  // LLVM-LABEL: _knot_mask8
+  // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: xor <8 x i1> [[L]], {{.*}}
+  // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+  // OGCG-LABEL: _knot_mask8
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: xor <8 x i1>
+  // OGCG: bitcast <8 x i1> {{.*}} to i8
+
+  return _knot_mask8(A);
+}
+
+// Multiple user-level mask helpers inline to this same kmov builtin.
+// CIR does not implement any special lowering for those helpers.
+//
+// Therefore, testing the builtin (__builtin_ia32_kmov*) directly is
+// sufficient to cover the CIR lowering behavior. Testing each helper
+// individually would add no new CIR paths.
+
+__mmask8 test_kmov_b(__mmask8 A) {
+  // CIR-LABEL: test_kmov_b
+  // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
+
+  // LLVM-LABEL: test_kmov_b
+  // LLVM: bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+  // OGCG-LABEL: test_kmov_b
+  // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: bitcast <8 x i1> {{.*}} to i8
+
+  return __builtin_ia32_kmovb(A);
+}
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/X86/avx512f-builtins.c b/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
index dc54a87856a7c..646e36215d0d8 100644
--- a/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CIR/CodeGen/X86/avx512f-builtins.c
@@ -1,79 +1,205 @@
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
-// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
-// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
-
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
-// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
-// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
-
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
-// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
-// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
-
-#include <immintrin.h>
-
-__m512 test_mm512_undefined(void) {
-  // CIR-LABEL: _mm512_undefined
-  // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
-  // CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
-
-  // LLVM-LABEL: test_mm512_undefined
-  // LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
-  // LLVM: ret <16 x float> %{{.*}}
-
-  // OGCG-LABEL: test_mm512_undefined
-  // OGCG: ret <16 x float> zeroinitializer
-  return _mm512_undefined();
-}
-
-__m512 test_mm512_undefined_ps(void) {
-  // CIR-LABEL: _mm512_undefined_ps
-  // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
-  // CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
-
-  // LLVM-LABEL: test_mm512_undefined_ps
-  // LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
-  // LLVM: ret <16 x float> %{{.*}}
-
-  // OGCG-LABEL: test_mm512_undefined_ps
-  // OGCG: ret <16 x float> zeroinitializer
-  return _mm512_undefined_ps();
-}
-
-__m512d test_mm512_undefined_pd(void) {
-  // CIR-LABEL: _mm512_undefined_pd
-  // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: cir.return %{{.*}} : !cir.vector<8 x !cir.double>
-
-  // LLVM-LABEL: test_mm512_undefined_pd
-  // LLVM: store <8 x double> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <8 x double>, ptr %[[A]], align 64
-  // LLVM: ret <8 x double> %{{.*}}
-
-  // OGCG-LABEL: test_mm512_undefined_pd
-  // OGCG: ret <8 x double> zeroinitializer
-  return _mm512_undefined_pd();
-}
-
-__m512i test_mm512_undefined_epi32(void) {
-  // CIR-LABEL: _mm512_undefined_epi32
-  // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
-  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<8 x !s64i>
-  // CIR: cir.return %{{.*}} : !cir.vector<8 x !s64i>
-
-  // LLVM-LABEL: test_mm512_undefined_epi32
-  // LLVM: store <8 x i64> zeroinitializer, ptr %[[A:.*]], align 64
-  // LLVM: %{{.*}} = load <8 x i64>, ptr %[[A]], align 64
-  // LLVM: ret <8 x i64> %{{.*}}
-
-  // OGCG-LABEL: test_mm512_undefined_epi32
-  // OGCG: ret <8 x i64> zeroinitializer
-  return _mm512_undefined_epi32();
-}
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+
+#include <immintrin.h>
+
+__m512 test_mm512_undefined(void) {
+  // CIR-LABEL: _mm512_undefined
+  // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
+  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
+  // CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
+
+  // LLVM-LABEL: test_mm512_undefined
+  // LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
+  // LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
+  // LLVM: ret <16 x float> %{{.*}}
+
+  // OGCG-LABEL: test_mm512_undefined
+  // OGCG: ret <16 x float> zeroinitializer
+  return _mm512_undefined();
+}
+
+__m512 test_mm512_undefined_ps(void) {
+  // CIR-LABEL: _mm512_undefined_ps
+  // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
+  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<16 x !cir.float>
+  // CIR: cir.return %{{.*}} : !cir.vector<16 x !cir.float>
+
+  // LLVM-LABEL: test_mm512_undefined_ps
+  // LLVM: store <16 x float> zeroinitializer, ptr %[[A:.*]], align 64
+  // LLVM: %{{.*}} = load <16 x float>, ptr %[[A]], align 64
+  // LLVM: ret <16 x float> %{{.*}}
+
+  // OGCG-LABEL: test_mm512_undefined_ps
+  // OGCG: ret <16 x float> zeroinitializer
+  return _mm512_undefined_ps();
+}
+
+__m512d test_mm512_undefined_pd(void) {
+  // CIR-LABEL: _mm512_undefined_pd
+  // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<8 x !cir.double>
+  // CIR: cir.return %{{.*}} : !cir.vector<8 x !cir.double>
+
+  // LLVM-LABEL: test_mm512_undefined_pd
+  // LLVM: store <8 x double> zeroinitializer, ptr %[[A:.*]], align 64
+  // LLVM: %{{.*}} = load <8 x double>, ptr %[[A]], align 64
+  // LLVM: ret <8 x double> %{{.*}}
+
+  // OGCG-LABEL: test_mm512_undefined_pd
+  // OGCG: ret <8 x double> zeroinitializer
+  return _mm512_undefined_pd();
+}
+
+__m512i test_mm512_undefined_epi32(void) {
+  // CIR-LABEL: _mm512_undefined_epi32
+  // CIR: %[[A:.*]] = cir.const #cir.zero : !cir.vector<8 x !cir.double>
+  // CIR: %{{.*}} = cir.cast bitcast %[[A]] : !cir.vector<8 x !cir.double> -> !cir.vector<8 x !s64i>
+  // CIR: cir.return %{{.*}} : !cir.vector<8 x !s64i>
+
+  // LLVM-LABEL: test_mm512_undefined_epi32
+  // LLVM: store <8 x i64> zeroinitializer, ptr %[[A:.*]], align 64
+  // LLVM: %{{.*}} = load <8 x i64>, ptr %[[A]], align 64
+  // LLVM: ret <8 x i64> %{{.*}}
+
+  // OGCG-LABEL: test_mm512_undefined_epi32
+  // OGCG: ret <8 x i64> zeroinitializer
+  return _mm512_undefined_epi32();
+}
+
+__mmask16 test_mm512_kand(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kand
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kand
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[RES:%.*]] = and <16 x i1> [[L]], [[R]]
+  // LLVM: bitcast <16 x i1> [[RES]] to i16
+
+  // OGCG-LABEL: _mm512_kand
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: and <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kand(A, B);
+}
+
+__mmask16 test_mm512_kandn(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kandn
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kandn
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: xor <16 x i1> [[L]], {{.*}}
+  // LLVM: and <16 x i1>
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: _mm512_kandn
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: xor <16 x i1>
+  // OGCG: and <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kandn(A, B);
+}
+
+__mmask16 test_mm512_kor(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kor
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.binop(or, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kor
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: or <16 x i1> [[L]], [[R]]
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: _mm512_kor
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: or <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kor(A, B);
+}
+
+__mmask16 test_mm512_kxor(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kxor
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kxor
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: xor <16 x i1> [[L]], [[R]]
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: _mm512_kxor
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: xor <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kxor(A, B);
+}
+
+__mmask16 test_mm512_knot(__mmask16 A) {
+  // CIR-LABEL: _mm512_knot
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_knot
+  // LLVM: bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: xor <16 x i1>
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: _mm512_knot
+  // OGCG: xor <16 x i1>
+
+  return _mm512_knot(A);
+}
+
+// Multiple user-level mask helpers inline to this same kmov builtin.
+// CIR does not implement any special lowering for those helpers.
+//
+// Therefore, testing the builtin (__builtin_ia32_kmov*) directly is
+// sufficient to cover the CIR lowering behavior. Testing each helper
+// individually would add no new CIR paths.
+
+__mmask16 test_kmov_w(__mmask16 A) {
+  // CIR-LABEL: test_kmov_w
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
+
+  // LLVM-LABEL: test_kmov_w
+  // LLVM: bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: test_kmov_w
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+
+  return __builtin_ia32_kmovw(A);
+}
\ No newline at end of file
