https://github.com/Priyanshu3820 updated 
https://github.com/llvm/llvm-project/pull/173143

>From 0dd977df6b2fda837c32044c199a13a32c232b6f Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Sat, 20 Dec 2025 08:25:59 +0000
Subject: [PATCH 01/13] Implement handling for convert-half builtins

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 55 ++++++++++++-
 .../X86/avx512vlbf16-builtins.c               | 80 +++++++++++++++++++
 2 files changed, 132 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 75bf25b20f1af..59d467da3a9fb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -362,6 +362,27 @@ static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, 
mlir::Location loc,
   return builder.createMul(loc, lhs, rhs);
 }
 
+static mlir::Value emitX86CvtF16ToFloatExpr(CIRGenBuilderTy &builder,
+                                            mlir::Location loc,
+                                            mlir::Type dstTy,
+                                            SmallVectorImpl<mlir::Value> &ops) 
{
+
+  mlir::Value src = ops[0];
+  mlir::Value passthru = ops[1];
+
+  auto vecTy = mlir::cast<cir::VectorType>(src.getType());
+  uint64_t numElems = vecTy.getSize();
+
+  mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElems);
+
+  auto halfTy = cir::VectorType::get(builder.getF16Type(), numElems);
+  mlir::Value srcF16 = builder.createBitcast(loc, src, halfTy);
+
+  mlir::Value res = builder.createFloatingCast(srcF16, dstTy);
+
+  return emitX86Select(builder, loc, mask, res, passthru);
+}
+
 static mlir::Value emitX86vpcom(CIRGenBuilderTy &builder, mlir::Location loc,
                                 llvm::SmallVector<mlir::Value> ops,
                                 bool isSigned) {
@@ -1662,12 +1683,40 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_cmpnltsd:
   case X86::BI__builtin_ia32_cmpnlesd:
   case X86::BI__builtin_ia32_cmpordsd:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
   case X86::BI__builtin_ia32_vcvtph2ps_mask:
   case X86::BI__builtin_ia32_vcvtph2ps256_mask:
-  case X86::BI__builtin_ia32_vcvtph2ps512_mask:
-  case X86::BI__builtin_ia32_cvtneps2bf16_128_mask:
+  case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
+    mlir::Location loc = getLoc(expr->getExprLoc());
+    return emitX86CvtF16ToFloatExpr(builder, loc, convertType(expr->getType()),
+                                    ops);
+  }
+  case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
+    mlir::Location loc = getLoc(expr->getExprLoc());
+    mlir::Value intrinsicMask = getMaskVecValue(builder, loc, ops[2], 4);
+    return emitIntrinsicCallOp(builder, loc,
+                               "x86.avx512bf16.mask.cvtneps2bf16.128",
+                               convertType(expr->getType()),
+                               mlir::ValueRange{ops[0], ops[1], 
intrinsicMask});
+  }
   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
-  case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
+  case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
+    mlir::Location loc = getLoc(expr->getExprLoc());
+    unsigned numElts = cast<cir::VectorType>(ops[1].getType()).getSize();
+    mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
+    StringRef intrinsicName;
+    if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.256";
+    else
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.512";
+    mlir::Value intrinsicResult =
+        emitIntrinsicCallOp(builder, loc, intrinsicName, ops[1].getType(),
+                            mlir::ValueRange{ops[0]});
+    return emitX86Select(builder, loc, selectMask, intrinsicResult, ops[1]);
+  }
   case X86::BI__cpuid:
   case X86::BI__cpuidex:
   case X86::BI__emul:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c 
b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
new file mode 100644
index 0000000000000..ccfc0d4a6a813
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
@@ -0,0 +1,80 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl 
-target-feature +avx512bf16 -fclangir -emit-cir -o %t.cir -Wall -Werror 
-Wsign-conversion 
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl 
-target-feature +avx512bf16 -fclangir -emit-llvm -o %t.ll -Wall -Werror 
-Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s 
-triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512vl 
-target-feature +avx512bf16 -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=OGCG --input-file=%t.ll %s
+
+#include <immintrin.h>
+
+__m256bh test_mm512_mask_cvtneps_pbh(__m256bh src, __mmask16 k, __m512 a) {
+  // CIR-LABEL: @test_mm512_mask_cvtneps_pbh
+  // CIR: cir.call @_mm512_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : 
(!cir.vector<16 x !cir.bf16>, !u16i, !cir.vector<16 x !cir.float>) -> 
!cir.vector<16 x !cir.bf16>
+
+  // LLVM-LABEL: @test_mm512_mask_cvtneps_pbh
+  // LLVM: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512
+
+  // OGCG-LABEL: @test_mm512_mask_cvtneps_pbh
+  // OGCG: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512
+  return _mm512_mask_cvtneps_pbh(src, k, a);
+}
+
+__m256bh test_mm512_maskz_cvtneps_pbh(__mmask16 k, __m512 a) {
+  // CIR-LABEL: @test_mm512_maskz_cvtneps_pbh
+  // CIR: cir.call @_mm512_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u16i, 
!cir.vector<16 x !cir.float>) -> !cir.vector<16 x !cir.bf16>
+
+  // LLVM-LABEL: @test_mm512_maskz_cvtneps_pbh
+  // LLVM: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x 
float> {{.+}})
+
+  // OGCG-LABEL:  @test_mm512_maskz_cvtneps_pbh
+  // OGCG: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x 
float> {{.+}})
+  return _mm512_maskz_cvtneps_pbh(k, a);
+}
+
+__m128bh test_mm256_mask_cvtneps_pbh(__m128bh src, __mmask8 k, __m256 a) {
+  // CIR-LABEL: test_mm256_mask_cvtneps_pbh
+  // CIR: cir.call @_mm256_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : 
(!cir.vector<8 x !cir.bf16>, !u8i, !cir.vector<8 x !cir.float>) -> 
!cir.vector<8 x !cir.bf16>
+  
+  // LLVM-LABEL: test_mm256_mask_cvtneps_pbh
+  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> 
{{.+}})
+
+  // OGCG-LABEL: test_mm256_mask_cvtneps_pbh
+  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> 
{{.+}})  
+  return _mm256_mask_cvtneps_pbh(src, k, a);
+}
+
+__m128bh test_mm256_maskz_cvtneps_pbh(__mmask8 k, __m256 a) {
+  // CIR-LABEL: test_mm256_maskz_cvtneps_pbh
+  // CIR: cir.call @_mm256_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u8i, 
!cir.vector<8 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
+
+  // LLVM-LABEL: test_mm256_maskz_cvtneps_pbh
+  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> 
{{.+}})
+
+  // OGCG-LABEL: test_mm256_maskz_cvtneps_pbh
+  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> 
{{.+}})
+  return _mm256_maskz_cvtneps_pbh(k, a);
+}
+
+__m128bh test_mm_mask_cvtneps_pbh(__m128bh src, __mmask8 k, __m128 a) {
+  // CIR-LABEL: test_mm_mask_cvtneps_pbh
+  // CIR: cir.call @_mm_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : 
(!cir.vector<8 x !cir.bf16>, !u8i, !cir.vector<4 x !cir.float>) -> 
!cir.vector<8 x !cir.bf1{{.+}}
+
+  // LLVM-LABEL: test_mm_mask_cvtneps_pbh
+  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x 
float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+
+  // OGCG-LABEL: test_mm_mask_cvtneps_pbh
+  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x 
float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+  return _mm_mask_cvtneps_pbh(src, k, a);
+}
+
+__m128bh test_mm_maskz_cvtneps_pbh(__mmask8 k, __m128 a) {
+  // CIR-LABEL: test_mm_maskz_cvtneps_pbh
+  // CIR: cir.call @_mm_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u8i, 
!cir.vector<4 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
+  
+  // LLVM-LABEL: test_mm_maskz_cvtneps_pbh
+  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x 
float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+
+  // OGCG-LABEL: test_mm_maskz_cvtneps_pbh
+  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x 
float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+  return _mm_maskz_cvtneps_pbh(k, a);
+}

>From ed0155382bf68c99fa3a7b158407da3717a73741 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Sat, 20 Dec 2025 12:21:53 +0000
Subject: [PATCH 02/13] Update CIRGenBuiltinX86.cpp

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 45 ++++++++++++++++------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 59d467da3a9fb..f27b68ca4a437 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -364,23 +364,46 @@ static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, 
mlir::Location loc,
 
 static mlir::Value emitX86CvtF16ToFloatExpr(CIRGenBuilderTy &builder,
                                             mlir::Location loc,
-                                            mlir::Type dstTy,
-                                            SmallVectorImpl<mlir::Value> &ops) 
{
+                                            SmallVectorImpl<mlir::Value> &ops,
+                                            mlir::Type DstTy) {
+  assert((ops.size() == 1 || ops.size() == 3 || ops.size() == 4) &&
+         "Unknown cvtph2ps intrinsic");
+
+  // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
+  if (ops.size() == 4) {
+    cir::ConstantOp constOp = ops[3].getDefiningOp<cir::ConstantOp>();
+    if (constOp &&
+        mlir::cast<mlir::IntegerAttr>(constOp.getValue()).getInt() != 4) {
+      return emitIntrinsicCallOp(builder, loc, "x86.avx512.mask.vcvtph2ps.512",
+                                 DstTy, ops);
+    }
+  }
 
-  mlir::Value src = ops[0];
-  mlir::Value passthru = ops[1];
+  uint64_t NumDstElts = mlir::cast<cir::VectorType>(DstTy).getSize();
+  mlir::Value Src = ops[0];
 
-  auto vecTy = mlir::cast<cir::VectorType>(src.getType());
-  uint64_t numElems = vecTy.getSize();
+  // Extract the subvector
+  if (NumDstElts != mlir::cast<cir::VectorType>(Src.getType()).getSize()) {
+    assert(NumDstElts == 4 && "Unexpected vector size");
 
-  mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElems);
+    SmallVector<int32_t, 4> indices = {0, 1, 2, 3};
+    Src = builder.createShuffle(loc, Src, Src, indices);
+  }
 
-  auto halfTy = cir::VectorType::get(builder.getF16Type(), numElems);
-  mlir::Value srcF16 = builder.createBitcast(loc, src, halfTy);
+  // Bitcast from vXi16 to vXf16.
+  cir::VectorType HalfTy =
+      cir::VectorType::get(builder.getF16Type(), NumDstElts);
+  Src = builder.createBitcast(loc, Src, HalfTy);
 
-  mlir::Value res = builder.createFloatingCast(srcF16, dstTy);
+  // Perform the fp-extension.
+  mlir::Value Res = builder.createFloatingCast(Src, DstTy);
+
+  if (ops.size() >= 3) {
+    mlir::Value MaskVec = getMaskVecValue(builder, loc, ops[2], NumDstElts);
+    Res = emitX86Select(builder, loc, MaskVec, Res, ops[1]);
+  }
 
-  return emitX86Select(builder, loc, mask, res, passthru);
+  return Res;
 }
 
 static mlir::Value emitX86vpcom(CIRGenBuilderTy &builder, mlir::Location loc,

>From 50e380f9aef07dc225e37147fa80da57c3839e56 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Sat, 20 Dec 2025 19:16:13 +0000
Subject: [PATCH 03/13] Update CIRGenBuiltinX86.cpp

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 78 ++++------------------
 1 file changed, 13 insertions(+), 65 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index f27b68ca4a437..7862119d659f8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -19,8 +19,9 @@
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
-#include "clang/CIR/MissingFeatures.h"
+
 #include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
 
 using namespace clang;
 using namespace clang::CIRGen;
@@ -362,50 +363,6 @@ static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, 
mlir::Location loc,
   return builder.createMul(loc, lhs, rhs);
 }
 
-static mlir::Value emitX86CvtF16ToFloatExpr(CIRGenBuilderTy &builder,
-                                            mlir::Location loc,
-                                            SmallVectorImpl<mlir::Value> &ops,
-                                            mlir::Type DstTy) {
-  assert((ops.size() == 1 || ops.size() == 3 || ops.size() == 4) &&
-         "Unknown cvtph2ps intrinsic");
-
-  // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
-  if (ops.size() == 4) {
-    cir::ConstantOp constOp = ops[3].getDefiningOp<cir::ConstantOp>();
-    if (constOp &&
-        mlir::cast<mlir::IntegerAttr>(constOp.getValue()).getInt() != 4) {
-      return emitIntrinsicCallOp(builder, loc, "x86.avx512.mask.vcvtph2ps.512",
-                                 DstTy, ops);
-    }
-  }
-
-  uint64_t NumDstElts = mlir::cast<cir::VectorType>(DstTy).getSize();
-  mlir::Value Src = ops[0];
-
-  // Extract the subvector
-  if (NumDstElts != mlir::cast<cir::VectorType>(Src.getType()).getSize()) {
-    assert(NumDstElts == 4 && "Unexpected vector size");
-
-    SmallVector<int32_t, 4> indices = {0, 1, 2, 3};
-    Src = builder.createShuffle(loc, Src, Src, indices);
-  }
-
-  // Bitcast from vXi16 to vXf16.
-  cir::VectorType HalfTy =
-      cir::VectorType::get(builder.getF16Type(), NumDstElts);
-  Src = builder.createBitcast(loc, Src, HalfTy);
-
-  // Perform the fp-extension.
-  mlir::Value Res = builder.createFloatingCast(Src, DstTy);
-
-  if (ops.size() >= 3) {
-    mlir::Value MaskVec = getMaskVecValue(builder, loc, ops[2], NumDstElts);
-    Res = emitX86Select(builder, loc, MaskVec, Res, ops[1]);
-  }
-
-  return Res;
-}
-
 static mlir::Value emitX86vpcom(CIRGenBuilderTy &builder, mlir::Location loc,
                                 llvm::SmallVector<mlir::Value> ops,
                                 bool isSigned) {
@@ -1706,38 +1663,29 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_cmpnltsd:
   case X86::BI__builtin_ia32_cmpnlesd:
   case X86::BI__builtin_ia32_cmpordsd:
+  case X86::BI__builtin_ia32_vcvtph2ps_mask:
+  case X86::BI__builtin_ia32_vcvtph2ps256_mask:
+  case X86::BI__builtin_ia32_vcvtph2ps512_mask:
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented X86 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
     return mlir::Value{};
-  case X86::BI__builtin_ia32_vcvtph2ps_mask:
-  case X86::BI__builtin_ia32_vcvtph2ps256_mask:
-  case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
-    mlir::Location loc = getLoc(expr->getExprLoc());
-    return emitX86CvtF16ToFloatExpr(builder, loc, convertType(expr->getType()),
-                                    ops);
-  }
-  case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
-    mlir::Location loc = getLoc(expr->getExprLoc());
-    mlir::Value intrinsicMask = getMaskVecValue(builder, loc, ops[2], 4);
-    return emitIntrinsicCallOp(builder, loc,
-                               "x86.avx512bf16.mask.cvtneps2bf16.128",
-                               convertType(expr->getType()),
-                               mlir::ValueRange{ops[0], ops[1], 
intrinsicMask});
-  }
+  case X86::BI__builtin_ia32_cvtneps2bf16_128_mask:
   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
     mlir::Location loc = getLoc(expr->getExprLoc());
-    unsigned numElts = cast<cir::VectorType>(ops[1].getType()).getSize();
+    mlir::Type resTy = convertType(expr->getType());
+    unsigned numElts = cast<cir::VectorType>(resTy).getSize();
     mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
     StringRef intrinsicName;
-    if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
+    if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_128_mask)
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.128";
+    else if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
       intrinsicName = "x86.avx512bf16.cvtneps2bf16.256";
     else
       intrinsicName = "x86.avx512bf16.cvtneps2bf16.512";
-    mlir::Value intrinsicResult =
-        emitIntrinsicCallOp(builder, loc, intrinsicName, ops[1].getType(),
-                            mlir::ValueRange{ops[0]});
+    mlir::Value intrinsicResult = emitIntrinsicCallOp(
+        builder, loc, intrinsicName, resTy, mlir::ValueRange{ops[0]});
     return emitX86Select(builder, loc, selectMask, intrinsicResult, ops[1]);
   }
   case X86::BI__cpuid:

>From b238c17d5d303e2910557b578503aa5ec5fcf2b3 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Sat, 20 Dec 2025 19:18:30 +0000
Subject: [PATCH 04/13] Update CIRGenBuiltinX86.cpp

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 7862119d659f8..810b027fdb33d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -19,9 +19,8 @@
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
-
+#include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/ErrorHandling.h"
-#include <cstdint>
 
 using namespace clang;
 using namespace clang::CIRGen;

>From 9d3a326e95d0efe48bb3fdad1d2157106e0fa749 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Sat, 20 Dec 2025 20:01:12 +0000
Subject: [PATCH 05/13] add support for maskz builtins

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 45 ++++++++++++++++++----
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 810b027fdb33d..cea4ef1b91275 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1675,18 +1675,49 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
     mlir::Location loc = getLoc(expr->getExprLoc());
     mlir::Type resTy = convertType(expr->getType());
     unsigned numElts = cast<cir::VectorType>(resTy).getSize();
-    mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
+    bool isMaskZ = false;
+    if (auto *callExpr = llvm::dyn_cast<clang::CallExpr>(expr)) {
+      if (auto *vecInit =
+              llvm::dyn_cast<clang::InitListExpr>(callExpr->getArg(0))) {
+        isMaskZ =
+            vecInit->getNumInits() == numElts &&
+            llvm::all_of(llvm::seq<unsigned>(0, numElts), [&](unsigned i) {
+              auto *init = vecInit->getInit(i);
+              if (auto *intLit = llvm::dyn_cast<clang::IntegerLiteral>(init))
+                return intLit->getValue().isZero();
+              if (auto *floatLit = 
llvm::dyn_cast<clang::FloatingLiteral>(init))
+                return floatLit->getValue().isZero();
+              return false;
+            });
+      }
+    }
     StringRef intrinsicName;
+    StringRef cirFuncName;
     if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_128_mask)
-      intrinsicName = "x86.avx512bf16.cvtneps2bf16.128";
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.128",
+      cirFuncName = isMaskZ ? "_mm_maskz_cvtneps_pbh" : "_mm_mask_cvtneps_pbh";
     else if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
-      intrinsicName = "x86.avx512bf16.cvtneps2bf16.256";
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.256",
+      cirFuncName =
+          isMaskZ ? "_mm256_maskz_cvtneps_pbh" : "_mm256_mask_cvtneps_pbh";
     else
-      intrinsicName = "x86.avx512bf16.cvtneps2bf16.512";
-    mlir::Value intrinsicResult = emitIntrinsicCallOp(
-        builder, loc, intrinsicName, resTy, mlir::ValueRange{ops[0]});
-    return emitX86Select(builder, loc, selectMask, intrinsicResult, ops[1]);
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.512",
+      cirFuncName =
+          isMaskZ ? "_mm512_maskz_cvtneps_pbh" : "_mm512_mask_cvtneps_pbh";
+    if (isMaskZ)
+      return builder
+          .createCallOp(
+              loc, mlir::SymbolRefAttr::get(builder.getContext(), cirFuncName),
+              resTy, {ops[2], ops[0]})
+          .getResult();
+    else {
+      mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
+      mlir::Value intrinsicResult = emitIntrinsicCallOp(
+          builder, loc, intrinsicName, resTy, mlir::ValueRange{ops[0]});
+      return emitX86Select(builder, loc, selectMask, intrinsicResult, ops[1]);
+    }
   }
+
   case X86::BI__cpuid:
   case X86::BI__cpuidex:
   case X86::BI__emul:

>From d2118e69640c48c9b0211b4a7a2abb3d437e5fc5 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Sat, 20 Dec 2025 20:34:04 +0000
Subject: [PATCH 06/13] Update CIRGenBuiltinX86.cpp

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index cea4ef1b91275..f349b795f4e06 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -19,7 +19,6 @@
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
-#include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace clang;
@@ -1674,6 +1673,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
     mlir::Location loc = getLoc(expr->getExprLoc());
     mlir::Type resTy = convertType(expr->getType());
+    if (!isa<cir::VectorType>(resTy)) {
+      llvm::report_fatal_error(
+          "Expected cir::VectorType for AVX512 BF16 builtin lowering.");
+    }
     unsigned numElts = cast<cir::VectorType>(resTy).getSize();
     bool isMaskZ = false;
     if (auto *callExpr = llvm::dyn_cast<clang::CallExpr>(expr)) {
@@ -1710,12 +1713,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned 
builtinID,
               loc, mlir::SymbolRefAttr::get(builder.getContext(), cirFuncName),
               resTy, {ops[2], ops[0]})
           .getResult();
-    else {
-      mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
-      mlir::Value intrinsicResult = emitIntrinsicCallOp(
-          builder, loc, intrinsicName, resTy, mlir::ValueRange{ops[0]});
-      return emitX86Select(builder, loc, selectMask, intrinsicResult, ops[1]);
-    }
+    mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
+    mlir::Value intrinsicResult = emitIntrinsicCallOp(
+        builder, loc, intrinsicName, resTy, mlir::ValueRange{ops[0]});
+    return emitX86Select(builder, loc, selectMask, intrinsicResult, ops[1]);
   }
 
   case X86::BI__cpuid:

>From 72d3cccc1e0ea58fb662d71822e58416910eefb4 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Sun, 21 Dec 2025 04:10:58 +0000
Subject: [PATCH 07/13] Guard cast to cir::VectorType

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 146dc7dc2bb0a..ed63cdfb99f77 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1841,8 +1841,7 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, 
const CallExpr *expr) {
     mlir::Location loc = getLoc(expr->getExprLoc());
     mlir::Type resTy = convertType(expr->getType());
     if (!isa<cir::VectorType>(resTy)) {
-      llvm::report_fatal_error(
-          "Expected cir::VectorType for AVX512 BF16 builtin lowering.");
+      return mlir::Value();
     }
     unsigned numElts = cast<cir::VectorType>(resTy).getSize();
     bool isMaskZ = false;

>From a37cef2506a6087a3d9b0fca41b5a5046d349043 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Sun, 21 Dec 2025 05:11:51 +0000
Subject: [PATCH 08/13] Update CIRGenBuiltinX86.cpp and test

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 41 +++----------------
 .../X86/avx512vlbf16-builtins.c               | 36 ----------------
 2 files changed, 5 insertions(+), 72 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index ed63cdfb99f77..791c73fc49838 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1839,46 +1839,15 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, 
const CallExpr *expr) {
   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
     mlir::Location loc = getLoc(expr->getExprLoc());
-    mlir::Type resTy = convertType(expr->getType());
-    if (!isa<cir::VectorType>(resTy)) {
-      return mlir::Value();
-    }
-    unsigned numElts = cast<cir::VectorType>(resTy).getSize();
-    bool isMaskZ = false;
-    if (auto *callExpr = llvm::dyn_cast<clang::CallExpr>(expr)) {
-      if (auto *vecInit =
-              llvm::dyn_cast<clang::InitListExpr>(callExpr->getArg(0))) {
-        isMaskZ =
-            vecInit->getNumInits() == numElts &&
-            llvm::all_of(llvm::seq<unsigned>(0, numElts), [&](unsigned i) {
-              auto *init = vecInit->getInit(i);
-              if (auto *intLit = llvm::dyn_cast<clang::IntegerLiteral>(init))
-                return intLit->getValue().isZero();
-              if (auto *floatLit = 
llvm::dyn_cast<clang::FloatingLiteral>(init))
-                return floatLit->getValue().isZero();
-              return false;
-            });
-      }
-    }
+    cir::VectorType resTy = 
cast<cir::VectorType>(convertType(expr->getType()));
+    unsigned numElts = resTy.getSize();
     StringRef intrinsicName;
-    StringRef cirFuncName;
     if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_128_mask)
-      intrinsicName = "x86.avx512bf16.cvtneps2bf16.128",
-      cirFuncName = isMaskZ ? "_mm_maskz_cvtneps_pbh" : "_mm_mask_cvtneps_pbh";
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.128";
     else if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
-      intrinsicName = "x86.avx512bf16.cvtneps2bf16.256",
-      cirFuncName =
-          isMaskZ ? "_mm256_maskz_cvtneps_pbh" : "_mm256_mask_cvtneps_pbh";
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.256";
     else
-      intrinsicName = "x86.avx512bf16.cvtneps2bf16.512",
-      cirFuncName =
-          isMaskZ ? "_mm512_maskz_cvtneps_pbh" : "_mm512_mask_cvtneps_pbh";
-    if (isMaskZ)
-      return builder
-          .createCallOp(
-              loc, mlir::SymbolRefAttr::get(builder.getContext(), cirFuncName),
-              resTy, {ops[2], ops[0]})
-          .getResult();
+      intrinsicName = "x86.avx512bf16.cvtneps2bf16.512";
     mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
     mlir::Value intrinsicResult = emitIntrinsicCallOp(
         builder, loc, intrinsicName, resTy, mlir::ValueRange{ops[0]});
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c 
b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
index ccfc0d4a6a813..d312ccbe93807 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
@@ -19,18 +19,6 @@ __m256bh test_mm512_mask_cvtneps_pbh(__m256bh src, __mmask16 
k, __m512 a) {
   return _mm512_mask_cvtneps_pbh(src, k, a);
 }
 
-__m256bh test_mm512_maskz_cvtneps_pbh(__mmask16 k, __m512 a) {
-  // CIR-LABEL: @test_mm512_maskz_cvtneps_pbh
-  // CIR: cir.call @_mm512_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u16i, 
!cir.vector<16 x !cir.float>) -> !cir.vector<16 x !cir.bf16>
-
-  // LLVM-LABEL: @test_mm512_maskz_cvtneps_pbh
-  // LLVM: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x 
float> {{.+}})
-
-  // OGCG-LABEL:  @test_mm512_maskz_cvtneps_pbh
-  // OGCG: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x 
float> {{.+}})
-  return _mm512_maskz_cvtneps_pbh(k, a);
-}
-
 __m128bh test_mm256_mask_cvtneps_pbh(__m128bh src, __mmask8 k, __m256 a) {
   // CIR-LABEL: test_mm256_mask_cvtneps_pbh
   // CIR: cir.call @_mm256_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : 
(!cir.vector<8 x !cir.bf16>, !u8i, !cir.vector<8 x !cir.float>) -> 
!cir.vector<8 x !cir.bf16>
@@ -43,18 +31,6 @@ __m128bh test_mm256_mask_cvtneps_pbh(__m128bh src, __mmask8 
k, __m256 a) {
   return _mm256_mask_cvtneps_pbh(src, k, a);
 }
 
-__m128bh test_mm256_maskz_cvtneps_pbh(__mmask8 k, __m256 a) {
-  // CIR-LABEL: test_mm256_maskz_cvtneps_pbh
-  // CIR: cir.call @_mm256_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u8i, 
!cir.vector<8 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
-
-  // LLVM-LABEL: test_mm256_maskz_cvtneps_pbh
-  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> 
{{.+}})
-
-  // OGCG-LABEL: test_mm256_maskz_cvtneps_pbh
-  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> 
{{.+}})
-  return _mm256_maskz_cvtneps_pbh(k, a);
-}
-
 __m128bh test_mm_mask_cvtneps_pbh(__m128bh src, __mmask8 k, __m128 a) {
   // CIR-LABEL: test_mm_mask_cvtneps_pbh
   // CIR: cir.call @_mm_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : 
(!cir.vector<8 x !cir.bf16>, !u8i, !cir.vector<4 x !cir.float>) -> 
!cir.vector<8 x !cir.bf1{{.+}}
@@ -66,15 +42,3 @@ __m128bh test_mm_mask_cvtneps_pbh(__m128bh src, __mmask8 k, 
__m128 a) {
   // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x 
float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
   return _mm_mask_cvtneps_pbh(src, k, a);
 }
-
-__m128bh test_mm_maskz_cvtneps_pbh(__mmask8 k, __m128 a) {
-  // CIR-LABEL: test_mm_maskz_cvtneps_pbh
-  // CIR: cir.call @_mm_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u8i, 
!cir.vector<4 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
-  
-  // LLVM-LABEL: test_mm_maskz_cvtneps_pbh
-  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x 
float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
-
-  // OGCG-LABEL: test_mm_maskz_cvtneps_pbh
-  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x 
float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
-  return _mm_maskz_cvtneps_pbh(k, a);
-}

>From cf040a08fa7d0934ec646cdce4b7d2adcafc2c16 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Mon, 22 Dec 2025 14:28:18 +0000
Subject: [PATCH 09/13] Update CIRGenBuiltinX86.cpp

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 30 +++++++++++++++-------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 791c73fc49838..d1eda4baa43c4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1835,7 +1835,20 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
                  std::string("unimplemented X86 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
     return mlir::Value{};
-  case X86::BI__builtin_ia32_cvtneps2bf16_128_mask:
+  case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
+    mlir::Location loc = getLoc(expr->getExprLoc());
+    cir::VectorType resTy = cast<cir::VectorType>(convertType(expr->getType()));
+    unsigned numElts = resTy.getSize();
+
+    mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElts);
+
+    SmallVector<mlir::Value, 3> args;
+    args.push_back(ops[0]);
+    args.push_back(ops[1]);
+    args.push_back(mask);
+
+    return emitIntrinsicCallOp(builder, loc, "x86.avx512bf16.mask.cvtneps2bf16.128", resTy, args)
+  }
   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
     mlir::Location loc = getLoc(expr->getExprLoc());
@@ -1843,17 +1856,16 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
     unsigned numElts = resTy.getSize();
     StringRef intrinsicName;
     if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_128_mask)
-      intrinsicName = "x86.avx512bf16.cvtneps2bf16.128";
-    else if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
       intrinsicName = "x86.avx512bf16.cvtneps2bf16.256";
-    else
+    else if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
       intrinsicName = "x86.avx512bf16.cvtneps2bf16.512";
-    mlir::Value selectMask = getMaskVecValue(builder, loc, ops[2], numElts);
-    mlir::Value intrinsicResult = emitIntrinsicCallOp(
-        builder, loc, intrinsicName, resTy, mlir::ValueRange{ops[0]});
-    return emitX86Select(builder, loc, selectMask, intrinsicResult, ops[1]);
-  }
+    
+    mlir::Value res = emitIntrinsicCallOp(builder, loc, intrinsicName, resTy, mlir::ValueRange{ops});
+
+    mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElts);
 
+    return emitX86Select(builder, loc, mask, res, ops[1]);
+  }
   case X86::BI__cpuid:
   case X86::BI__cpuidex:
   case X86::BI__emul:

>From 00037b3353fcb6a84b52aec1a5f909b2f8aec65e Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Mon, 22 Dec 2025 14:30:02 +0000
Subject: [PATCH 10/13] Fix formatting

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index d1eda4baa43c4..0f2d607cb4e0c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1847,7 +1847,8 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
     args.push_back(ops[1]);
     args.push_back(mask);
 
-    return emitIntrinsicCallOp(builder, loc, "x86.avx512bf16.mask.cvtneps2bf16.128", resTy, args)
+    return emitIntrinsicCallOp(
+        builder, loc, "x86.avx512bf16.mask.cvtneps2bf16.128", resTy, args)
   }
   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
@@ -1859,8 +1860,9 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
       intrinsicName = "x86.avx512bf16.cvtneps2bf16.256";
     else if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
       intrinsicName = "x86.avx512bf16.cvtneps2bf16.512";
-    
-    mlir::Value res = emitIntrinsicCallOp(builder, loc, intrinsicName, resTy, mlir::ValueRange{ops});
+
+    mlir::Value res = emitIntrinsicCallOp(builder, loc, intrinsicName, resTy,
+                                          mlir::ValueRange{ops});
 
     mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElts);
 

>From 9e570f3fec26199d0de621fc48232f4a0db15d07 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Mon, 22 Dec 2025 14:40:04 +0000
Subject: [PATCH 11/13] Fix syntax error

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 0f2d607cb4e0c..fb43e307181e4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1848,7 +1848,7 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
     args.push_back(mask);
 
     return emitIntrinsicCallOp(
-        builder, loc, "x86.avx512bf16.mask.cvtneps2bf16.128", resTy, args)
+        builder, loc, "x86.avx512bf16.mask.cvtneps2bf16.128", resTy, args);
   }
   case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {

>From 32924311d73d8457e58c3991039e557c31c702a7 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Mon, 22 Dec 2025 16:54:07 +0000
Subject: [PATCH 12/13] Update CIRGenBuiltinX86.cpp

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index fb43e307181e4..5326fd39e6c1f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1855,14 +1855,15 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
     mlir::Location loc = getLoc(expr->getExprLoc());
     cir::VectorType resTy = cast<cir::VectorType>(convertType(expr->getType()));
     unsigned numElts = resTy.getSize();
+    llvm::errs() << "DEBUG: Number of elements is: " << numElts << "\n";
     StringRef intrinsicName;
-    if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_128_mask)
+    if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
       intrinsicName = "x86.avx512bf16.cvtneps2bf16.256";
-    else if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
+    else if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_512_mask)
       intrinsicName = "x86.avx512bf16.cvtneps2bf16.512";
 
     mlir::Value res = emitIntrinsicCallOp(builder, loc, intrinsicName, resTy,
-                                          mlir::ValueRange{ops});
+                                          mlir::ValueRange{ops[0]});
 
     mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElts);
 

>From ac275a9f0a917813055aa113db4afdeac4df4b04 Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <[email protected]>
Date: Mon, 22 Dec 2025 17:19:24 +0000
Subject: [PATCH 13/13] Update CIRGenBuiltinX86.cpp and test

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    | 12 +++--
 .../X86/avx512vlbf16-builtins.c               | 51 ++++++++++++++++---
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 5326fd39e6c1f..9d23853acd3d4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1838,7 +1838,9 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
     mlir::Location loc = getLoc(expr->getExprLoc());
     cir::VectorType resTy = cast<cir::VectorType>(convertType(expr->getType()));
-    unsigned numElts = resTy.getSize();
+
+    cir::VectorType inputTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = inputTy.getSize();
 
     mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElts);
 
@@ -1854,7 +1856,9 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
     mlir::Location loc = getLoc(expr->getExprLoc());
     cir::VectorType resTy = cast<cir::VectorType>(convertType(expr->getType()));
-    unsigned numElts = resTy.getSize();
+    cir::VectorType inputTy = cast<cir::VectorType>(ops[0].getType());
+    unsigned numElts = inputTy.getSize();
+
     llvm::errs() << "DEBUG: Number of elements is: " << numElts << "\n";
     StringRef intrinsicName;
     if (builtinID == X86::BI__builtin_ia32_cvtneps2bf16_256_mask)
@@ -1865,9 +1869,7 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
     mlir::Value res = emitIntrinsicCallOp(builder, loc, intrinsicName, resTy,
                                           mlir::ValueRange{ops[0]});
 
-    mlir::Value mask = getMaskVecValue(builder, loc, ops[2], numElts);
-
-    return emitX86Select(builder, loc, mask, res, ops[1]);
+    return emitX86Select(builder, loc, ops[2], res, ops[1]);
   }
   case X86::BI__cpuid:
   case X86::BI__cpuidex:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
index d312ccbe93807..31c3ddd2e6259 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlbf16-builtins.c
@@ -8,7 +8,7 @@
 #include <immintrin.h>
 
 __m256bh test_mm512_mask_cvtneps_pbh(__m256bh src, __mmask16 k, __m512 a) {
-  // CIR-LABEL: @test_mm512_mask_cvtneps_pbh
+  // CIR-LABEL: test_mm512_mask_cvtneps_pbh
   // CIR: cir.call @_mm512_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : (!cir.vector<16 x !cir.bf16>, !u16i, !cir.vector<16 x !cir.float>) -> !cir.vector<16 x !cir.bf16>
 
   // LLVM-LABEL: @test_mm512_mask_cvtneps_pbh
@@ -19,26 +19,63 @@ __m256bh test_mm512_mask_cvtneps_pbh(__m256bh src, __mmask16 k, __m512 a) {
   return _mm512_mask_cvtneps_pbh(src, k, a);
 }
 
+__m256bh test_mm512_maskz_cvtneps_pbh(__mmask16 k, __m512 a) {
+  // CIR-LABEL: test_mm512_maskz_cvtneps_pbh
+  // CIR: cir.call @_mm512_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u16i, !cir.vector<16 x !cir.float>) -> !cir.vector<16 x !cir.bf16>
+
+  // LLVM-LABEL: @test_mm512_maskz_cvtneps_pbh
+  // LLVM: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> {{.+}})
+
+  // OGCG-LABEL:  @test_mm512_maskz_cvtneps_pbh
+  // OGCG: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> {{.+}})
+  return _mm512_maskz_cvtneps_pbh(k, a);
+}
+
+
 __m128bh test_mm256_mask_cvtneps_pbh(__m128bh src, __mmask8 k, __m256 a) {
   // CIR-LABEL: test_mm256_mask_cvtneps_pbh
   // CIR: cir.call @_mm256_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : (!cir.vector<8 x !cir.bf16>, !u8i, !cir.vector<8 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
   
-  // LLVM-LABEL: test_mm256_mask_cvtneps_pbh
+  // LLVM-LABEL: @test_mm256_mask_cvtneps_pbh
   // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> {{.+}})
 
-  // OGCG-LABEL: test_mm256_mask_cvtneps_pbh
+  // OGCG-LABEL: @test_mm256_mask_cvtneps_pbh
   // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> {{.+}})  
   return _mm256_mask_cvtneps_pbh(src, k, a);
 }
 
+__m128bh test_mm256_maskz_cvtneps_pbh(__mmask8 k, __m256 a) {
+  // CIR-LABEL: test_mm256_maskz_cvtneps_pbh
+  // CIR: cir.call @_mm256_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u8i, !cir.vector<8 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
+
+  // LLVM-LABEL: @test_mm256_maskz_cvtneps_pbh
+  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> {{.+}})
+
+  // OGCG-LABEL: @test_mm256_maskz_cvtneps_pbh
+  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> {{.+}})
+  return _mm256_maskz_cvtneps_pbh(k, a);
+}
+
 __m128bh test_mm_mask_cvtneps_pbh(__m128bh src, __mmask8 k, __m128 a) {
   // CIR-LABEL: test_mm_mask_cvtneps_pbh
   // CIR: cir.call @_mm_mask_cvtneps_pbh({{.+}}, {{.+}}, {{.+}}) : (!cir.vector<8 x !cir.bf16>, !u8i, !cir.vector<4 x !cir.float>) -> !cir.vector<8 x !cir.bf1{{.+}}
 
-  // LLVM-LABEL: test_mm_mask_cvtneps_pbh
-  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+  // LLVM-LABEL: @test_mm_mask_cvtneps_pbh
+  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> {{.+}})
 
-  // OGCG-LABEL: test_mm_mask_cvtneps_pbh
-  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> %extract.i)
+  // OGCG-LABEL: @test_mm_mask_cvtneps_pbh
+  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> {{.+}})
   return _mm_mask_cvtneps_pbh(src, k, a);
 }
+
+__m128bh test_mm_maskz_cvtneps_pbh(__mmask8 k, __m128 a) {
+  // CIR-LABEL: test_mm_maskz_cvtneps_pbh
+  // CIR: cir.call @_mm_maskz_cvtneps_pbh({{.+}}, {{.+}}) : (!u8i, !cir.vector<4 x !cir.float>) -> !cir.vector<8 x !cir.bf16>
+  
+  // LLVM-LABEL: @test_mm_maskz_cvtneps_pbh
+  // LLVM: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> {{.+}})
+
+  // OGCG-LABEL: @test_mm_maskz_cvtneps_pbh
+  // OGCG: call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> {{.+}}, <8 x bfloat> {{.+}}, <4 x i1> {{.+}})
+  return _mm_maskz_cvtneps_pbh(k, a);
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to