https://github.com/yairbenavraham updated 
https://github.com/llvm/llvm-project/pull/195602

>From 5047649698cd34eaa6e824405782991d66bf2234 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Wed, 29 Apr 2026 19:24:37 +0300
Subject: [PATCH 1/5] [CIR][AArch64] Lower vfmaq_v f32/f64

Lower BI__builtin_neon_vfmaq_v for the vfmaq_f32 and vfmaq_f64 wrappers
through the LLVM fma intrinsic.

Keep vfma_v and vfmaq_f16 outside this focused split.

Move the replaced vfmaq_f32 and vfmaq_f64 tests into neon/vfmaq.c with
CIR coverage.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 23 ++++++-
 clang/test/CodeGen/AArch64/neon-intrinsics.c  | 38 -----------
 clang/test/CodeGen/AArch64/neon/vfmaq.c       | 65 +++++++++++++++++++
 3 files changed, 87 insertions(+), 39 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfmaq.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 834f66586833b..349d6c837af12 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -501,6 +501,7 @@ static mlir::Value emitCommonNeonBuiltinExpr(
 
   // Determine the type of this overloaded NEON intrinsic.
   NeonTypeFlags neonType(neonTypeConst->getZExtValue());
+
   const bool isUnsigned = neonType.isUnsigned();
   const bool hasLegalHalfType = cgf.getTarget().hasFastHalfType();
   const bool usgn = neonType.isUnsigned();
@@ -677,7 +678,20 @@ static mlir::Value emitCommonNeonBuiltinExpr(
   case NEON::BI__builtin_neon_vext_v:
   case NEON::BI__builtin_neon_vextq_v:
   case NEON::BI__builtin_neon_vfma_v:
-  case NEON::BI__builtin_neon_vfmaq_v:
+    cgf.cgm.errorNYI(expr->getSourceRange(),
+                     std::string("unimplemented AArch64 builtin call: ") +
+                         ctx.BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmaq_v: {
+    mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty);
+    mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty);
+    mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
+    llvm::SmallVector<mlir::Value> fmaOps = {op1, op2, op0};
+    return cir::LLVMIntrinsicCallOp::create(
+               cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"),
+               ty, fmaOps)
+        .getResult();
+  }
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v:
   case NEON::BI__builtin_neon_vld1_x2_v:
@@ -2092,6 +2106,13 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
+  if (builtinID == NEON::BI__builtin_neon_vfmaq_f16) {
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  }
+
   // Handle MSVC intrinsics before argument evaluation to prevent double
   // evaluation.
   assert(!cir::MissingFeatures::msvcBuiltins());
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 784d9624823d5..64bbf3e90d675 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -890,44 +890,6 @@ float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, 
float32x2_t v3) {
   return vfma_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
-// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], 
<4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]])
-// CHECK-NEXT:    ret <4 x float> [[TMP9]]
-//
-float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  return vfmaq_f32(v1, v2, v3);
-}
-
-// CHECK-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
-// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef 
[[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x 
double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]])
-// CHECK-NEXT:    ret <2 x double> [[TMP9]]
-//
-float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  return vfmaq_f64(v1, v2, v3);
-}
-
 // CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32(
 // CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], 
<2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/vfmaq.c 
b/clang/test/CodeGen/AArch64/neon/vfmaq.c
new file mode 100644
index 0000000000000..54bc9d1a2cc5c
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfmaq.c
@@ -0,0 +1,65 @@
+// REQUIRES: aarch64-registered-target || arm-registered-target
+
+// RUN:                   %clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -disable-O0-optnone -flax-vector-conversions=none         
  -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s 
--check-prefixes=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -disable-O0-optnone -flax-vector-conversions=none 
-fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s 
--check-prefixes=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -disable-O0-optnone -flax-vector-conversions=none 
-fclangir -emit-cir  -o - %s |                               FileCheck %s 
--check-prefixes=CIR %}
+
+//=============================================================================
+// NOTES
+//
+// This file contains tests that were originally located in:
+//  * clang/test/CodeGen/AArch64/neon-intrinsics.c
+// The main difference is the use of RUN lines that enable ClangIR lowering.
+// This file currently covers the f32/f64 wrappers that lower through
+// BI__builtin_neon_vfmaq_v.
+//
+// ACLE section headings based on v2025Q2 of the ACLE specification:
+//  * 
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
+//
+//=============================================================================
+
+#include <arm_neon.h>
+
+//===------------------------------------------------------===//
+// Fused multiply-accumulate, vector quad forms
+//===------------------------------------------------------===//
+
+// CIR-LABEL: @vfmaq_f32(
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x 
!cir.float>) -> !cir.vector<4 x !cir.float>
+
+// LLVM-LABEL: @test_vfmaq_f32(
+float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
+// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x 
float> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <4 x float> [[C]] to <4 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i32> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <4 x float>
+// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[B_CAST]], <4 x float> [[C_CAST]], <4 x float> [[A_CAST]])
+// LLVM-NEXT: ret <4 x float> [[FMA]]
+  return vfmaq_f32(a, b, c);
+}
+
+// CIR-LABEL: @vfmaq_f64(
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x 
!cir.double>) -> !cir.vector<2 x !cir.double>
+
+// LLVM-LABEL: @test_vfmaq_f64(
+float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) {
+// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 
x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <2 x double> [[C]] to <2 x i64>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <2 x i64> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <2 x double>
+// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> 
[[B_CAST]], <2 x double> [[C_CAST]], <2 x double> [[A_CAST]])
+// LLVM-NEXT: ret <2 x double> [[FMA]]
+  return vfmaq_f64(a, b, c);
+}

>From bbf5ccdb53b77b58cbe8b68bf6a66915412f484b Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Mon, 4 May 2026 15:05:24 +0300
Subject: [PATCH 2/5] [CIR][AArch64] Format vfmaq_v lowering

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 349d6c837af12..6fc63c46e949d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -688,8 +688,8 @@ static mlir::Value emitCommonNeonBuiltinExpr(
     mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
     llvm::SmallVector<mlir::Value> fmaOps = {op1, op2, op0};
     return cir::LLVMIntrinsicCallOp::create(
-               cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"),
-               ty, fmaOps)
+               cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"), 
ty,
+               fmaOps)
         .getResult();
   }
   case NEON::BI__builtin_neon_vld1_v:

>From 7cb2f9673150a2c410cda037bfbe307be44417c8 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Mon, 4 May 2026 22:40:26 +0300
Subject: [PATCH 3/5] [CIR][AArch64] Refine vfmaq_v lowering

Move the helper definition above the first use.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 32 +++++++++----------
 clang/test/CodeGen/AArch64/neon/vfmaq.c       | 11 ++++---
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 6fc63c46e949d..50b9f8020235e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -85,6 +85,17 @@ findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> 
intrinsicMap,
 
//===----------------------------------------------------------------------===//
 //  Generic helpers
 
//===----------------------------------------------------------------------===//
+// Emit an intrinsic where all operands are of the same type as the result.
+// Depending on mode, this may be a constrained floating-point intrinsic.
+static mlir::Value
+emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc,
+                                StringRef intrName, mlir::Type retTy,
+                                llvm::SmallVector<mlir::Value> &ops) {
+  assert(!cir::MissingFeatures::emitConstrainedFPCall());
+
+  return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops);
+}
+
 static llvm::StringRef getLLVMIntrNameNoPrefix(llvm::Intrinsic::ID intrID) {
   llvm::StringRef llvmIntrName = llvm::Intrinsic::getBaseName(intrID);
   assert(llvmIntrName.starts_with("llvm.") && "Not an LLVM intrinsic!");
@@ -501,7 +512,6 @@ static mlir::Value emitCommonNeonBuiltinExpr(
 
   // Determine the type of this overloaded NEON intrinsic.
   NeonTypeFlags neonType(neonTypeConst->getZExtValue());
-
   const bool isUnsigned = neonType.isUnsigned();
   const bool hasLegalHalfType = cgf.getTarget().hasFastHalfType();
   const bool usgn = neonType.isUnsigned();
@@ -683,14 +693,15 @@ static mlir::Value emitCommonNeonBuiltinExpr(
                          ctx.BuiltinInfo.getName(builtinID));
     return mlir::Value{};
   case NEON::BI__builtin_neon_vfmaq_v: {
+    // NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2)
+    // LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator)
+    // Reorder arguments to match LLVM fma signature
     mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty);
     mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty);
     mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
     llvm::SmallVector<mlir::Value> fmaOps = {op1, op2, op0};
-    return cir::LLVMIntrinsicCallOp::create(
-               cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"), 
ty,
-               fmaOps)
-        .getResult();
+    return emitCallMaybeConstrainedBuiltin(cgf.getBuilder(), loc, "fma", ty,
+                                           fmaOps);
   }
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v:
@@ -876,17 +887,6 @@ static mlir::Value emitCommonNeonBuiltinExpr(
   }
 }
 
-// Emit an intrinsic where all operands are of the same type as the result.
-// Depending on mode, this may be a constrained floating-point intrinsic.
-static mlir::Value
-emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc,
-                                StringRef intrName, mlir::Type retTy,
-                                llvm::SmallVector<mlir::Value> &ops) {
-  assert(!cir::MissingFeatures::emitConstrainedFPCall());
-
-  return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops);
-}
-
 bool CIRGenFunction::getAArch64SVEProcessedOperands(
     unsigned builtinID, const CallExpr *expr, SmallVectorImpl<mlir::Value> 
&ops,
     SVETypeFlags typeFlags) {
diff --git a/clang/test/CodeGen/AArch64/neon/vfmaq.c 
b/clang/test/CodeGen/AArch64/neon/vfmaq.c
index 54bc9d1a2cc5c..1c05de703dace 100644
--- a/clang/test/CodeGen/AArch64/neon/vfmaq.c
+++ b/clang/test/CodeGen/AArch64/neon/vfmaq.c
@@ -21,14 +21,15 @@
 #include <arm_neon.h>
 
 //===------------------------------------------------------===//
-// Fused multiply-accumulate, vector quad forms
+// 2.6.1.9.3 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
-// CIR-LABEL: @vfmaq_f32(
-// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x 
!cir.float>) -> !cir.vector<4 x !cir.float>
 
 // LLVM-LABEL: @test_vfmaq_f32(
+// CIR-LABEL: @vfmaq_f32(
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x 
!cir.float>) -> !cir.vector<4 x !cir.float>
+
 // LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x 
float> {{.*}} [[C:%.*]]) {{.*}} {
 // LLVM:      [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
 // LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
@@ -44,11 +45,11 @@ float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, 
float32x4_t c) {
   return vfmaq_f32(a, b, c);
 }
 
+// LLVM-LABEL: @test_vfmaq_f64(
 // CIR-LABEL: @vfmaq_f64(
+float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) {
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x 
!cir.double>) -> !cir.vector<2 x !cir.double>
 
-// LLVM-LABEL: @test_vfmaq_f64(
-float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) {
 // LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 
x double> {{.*}} [[C:%.*]]) {{.*}} {
 // LLVM:      [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
 // LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>

>From 578b3160b1dbadfc4bd19f47490bf3df6e6c5567 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Thu, 7 May 2026 18:23:26 +0300
Subject: [PATCH 4/5] [clang][test] Add %clang_cc1_cg_arm64_neon substitution

Move the vfmaq coverage to fused-multiply.c and use the shared
%clang_cc1_cg_arm64_neon test substitution for LLVM and CIR checks.

Add vfmaq_f16 under +fullfp16, drop its temporary CIR NYI guard, and
remove the duplicated non-constrained legacy check. The constrained-FP
coverage remains.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  7 -----
 .../neon/{vfmaq.c => fused-multiply.c}        | 30 ++++++++++++++++---
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 19 ------------
 3 files changed, 26 insertions(+), 30 deletions(-)
 rename clang/test/CodeGen/AArch64/neon/{vfmaq.c => fused-multiply.c} (63%)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 50b9f8020235e..f253852673059 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2106,13 +2106,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
-  if (builtinID == NEON::BI__builtin_neon_vfmaq_f16) {
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AArch64 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return mlir::Value{};
-  }
-
   // Handle MSVC intrinsics before argument evaluation to prevent double
   // evaluation.
   assert(!cir::MissingFeatures::msvcBuiltins());
diff --git a/clang/test/CodeGen/AArch64/neon/vfmaq.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
similarity index 63%
rename from clang/test/CodeGen/AArch64/neon/vfmaq.c
rename to clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 1c05de703dace..e7957aecccf83 100644
--- a/clang/test/CodeGen/AArch64/neon/vfmaq.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -1,8 +1,10 @@
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
-// RUN:                   %clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -disable-O0-optnone -flax-vector-conversions=none         
  -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s 
--check-prefixes=LLVM
-// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -disable-O0-optnone -flax-vector-conversions=none 
-fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s 
--check-prefixes=LLVM %}
-// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -disable-O0-optnone -flax-vector-conversions=none 
-fclangir -emit-cir  -o - %s |                               FileCheck %s 
--check-prefixes=CIR %}
+// RUN:                   %clang_cc1_cg_arm64_neon -target-feature +fullfp16   
        -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | 
FileCheck %s --check-prefixes=ALL,LLVM
+// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 
-fclangir -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | 
FileCheck %s --check-prefixes=ALL,LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 
-fclangir -emit-cir  %s -disable-O0-optnone |                               
FileCheck %s --check-prefixes=ALL,CIR %}
+
+// ALL: {{[Mm]}}odule
 
 //=============================================================================
 // NOTES
@@ -10,7 +12,7 @@
 // This file contains tests that were originally located in:
 //  * clang/test/CodeGen/AArch64/neon-intrinsics.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
-// This file currently covers the f32/f64 wrappers that lower through
+// This file currently covers the f16/f32/f64 wrappers that lower through
 // BI__builtin_neon_vfmaq_v.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
@@ -25,6 +27,26 @@
 //===------------------------------------------------------===//
 
 
+// LLVM-LABEL: @test_vfmaq_f16(
+// CIR-LABEL: @vfmaq_f16(
+float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
+
+// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
+// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[B_CAST]], <8 x half> [[C_CAST]], <8 x half> [[A_CAST]])
+// LLVM-NEXT: ret <8 x half> [[FMA]]
+  return vfmaq_f16(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f32(
 // CIR-LABEL: @vfmaq_f32(
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index b8380bd8ed6d4..ff1c206fc6350 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1621,25 +1621,6 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, 
float16x4_t c) {
   return vfma_f16(a, b, c);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]])
-// CHECK-NEXT:    ret <8 x half> [[TMP9]]
-//
-float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
-  return vfmaq_f16(a, b, c);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfms_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:

>From eaba7884e7dcb6b086bdd2ca41b193c8bddd99b5 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Fri, 8 May 2026 15:09:39 +0300
Subject: [PATCH 5/5] [clang][test] Split vfmaq_f16 fullfp16 test

Move the vfmaq_f16 coverage into fused-multiple-fullfp16.c, with
+fullfp16 enabled there and a note that it came from
v8.2a-neon-intrinsics.c.

Keep fused-multiply.c for the f32/f64 cases and drop +fullfp16 from
that test.
---
 .../AArch64/neon/fused-multiple-fullfp16.c    | 47 +++++++++++++++++++
 .../CodeGen/AArch64/neon/fused-multiply.c     | 31 ++----------
 2 files changed, 52 insertions(+), 26 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c

diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
new file mode 100644
index 0000000000000..af9330865796d
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -0,0 +1,47 @@
+// REQUIRES: aarch64-registered-target || arm-registered-target
+
+// RUN:                   %clang_cc1_cg_arm64_neon -target-feature +fullfp16   
        -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | 
FileCheck %s --check-prefixes=ALL,LLVM
+// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 
-fclangir -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | 
FileCheck %s --check-prefixes=ALL,LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 
-fclangir -emit-cir  %s -disable-O0-optnone |                               
FileCheck %s --check-prefixes=ALL,CIR %}
+
+// ALL: {{[Mm]}}odule
+
+//=============================================================================
+// NOTES
+//
+// This file contains fullfp16 tests that were originally located in:
+//  * clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+// The main difference is the use of RUN lines that enable ClangIR lowering.
+// This file currently covers the f16 wrapper that lowers through
+// BI__builtin_neon_vfmaq_v.
+//
+// ACLE section headings based on v2025Q2 of the ACLE specification:
+//  * 
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2
+//
+//=============================================================================
+
+#include <arm_neon.h>
+
+//===------------------------------------------------------===//
+// 2.6.1.9.3 Fused multiply-accumulate, vector quad forms
+//===------------------------------------------------------===//
+
+// LLVM-LABEL: @test_vfmaq_f16(
+// CIR-LABEL: @vfmaq_f16(
+float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
+
+// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
+// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[B_CAST]], <8 x half> [[C_CAST]], <8 x half> [[A_CAST]])
+// LLVM-NEXT: ret <8 x half> [[FMA]]
+  return vfmaq_f16(a, b, c);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index e7957aecccf83..2501f54fb5427 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -1,8 +1,8 @@
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
-// RUN:                   %clang_cc1_cg_arm64_neon -target-feature +fullfp16   
        -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | 
FileCheck %s --check-prefixes=ALL,LLVM
-// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 
-fclangir -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | 
FileCheck %s --check-prefixes=ALL,LLVM %}
-// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 
-fclangir -emit-cir  %s -disable-O0-optnone |                               
FileCheck %s --check-prefixes=ALL,CIR %}
+// RUN:                   %clang_cc1_cg_arm64_neon           -emit-llvm %s 
-disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s 
--check-prefixes=ALL,LLVM
+// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -fclangir -emit-llvm %s 
-disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s 
--check-prefixes=ALL,LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -fclangir -emit-cir  %s 
-disable-O0-optnone |                               FileCheck %s 
--check-prefixes=ALL,CIR %}
 
 // ALL: {{[Mm]}}odule
 
@@ -12,7 +12,7 @@
 // This file contains tests that were originally located in:
 //  * clang/test/CodeGen/AArch64/neon-intrinsics.c
 // The main difference is the use of RUN lines that enable ClangIR lowering.
-// This file currently covers the f16/f32/f64 wrappers that lower through
+// This file currently covers the f32/f64 wrappers that lower through
 // BI__builtin_neon_vfmaq_v.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
@@ -23,30 +23,9 @@
 #include <arm_neon.h>
 
 //===------------------------------------------------------===//
-// 2.6.1.9.3 Fused multiply-accumulate, vector quad forms
+// 2.1.1.2.5 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
-
-// LLVM-LABEL: @test_vfmaq_f16(
-// CIR-LABEL: @vfmaq_f16(
-float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
-// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
-
-// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
-// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
-// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
-// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
-// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
-// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[B_CAST]], <8 x half> [[C_CAST]], <8 x half> [[A_CAST]])
-// LLVM-NEXT: ret <8 x half> [[FMA]]
-  return vfmaq_f16(a, b, c);
-}
-
 // LLVM-LABEL: @test_vfmaq_f32(
 // CIR-LABEL: @vfmaq_f32(
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to