[clang] [CIR][AArch64] Lower vfma and scalar FMA lane builtins (PR #204819)

Yair Ben Avraham via cfe-commits Thu, 25 Jun 2026 21:55:31 -0700

https://github.com/yairbenavraham updated 
https://github.com/llvm/llvm-project/pull/204819


>From 1428e169ec0a399ab0eb122d98983f2450567d39 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 14:25:51 +0300
Subject: [PATCH 1/7] [CIR][AArch64] Lower vfma_v builtin

Lower BI__builtin_neon_vfma_v through the existing vfmaq_v FMA path.

Add f16, f32, and f64 coverage for the non-quad vfma_* ACLE wrappers.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  6 ++---
 .../AArch64/neon/fused-multiple-fullfp16.c    | 11 ++++++++++
 .../CodeGen/AArch64/neon/fused-multiply.c     | 22 +++++++++++++++++++
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 8b077620d2bab..3900e02472e75 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -837,15 +837,15 @@ static mlir::Value emitCommonNeonBuiltinExpr(
   case NEON::BI__builtin_neon_vcvtx_f32_v:
   case NEON::BI__builtin_neon_vext_v:
   case NEON::BI__builtin_neon_vextq_v:
-  case NEON::BI__builtin_neon_vfma_v:
     cgf.cgm.errorNYI(expr->getSourceRange(),
                      std::string("unimplemented AArch64 builtin call: ") +
                          ctx.BuiltinInfo.getName(builtinID));
     return mlir::Value{};
+  case NEON::BI__builtin_neon_vfma_v:
   case NEON::BI__builtin_neon_vfmaq_v: {
-    // NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2)
+    // NEON intrinsic: vfma(q)(accumulator, multiplicand1, multiplicand2)
     // LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator)
-    // Reorder arguments to match LLVM fma signature
+    // Reorder arguments to match LLVM fma signature.
     mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty);
     mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty);
     mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 1460fb3b2bae1..25c7214d68ab7 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -27,6 +27,17 @@
 // 2.6.1.9.3 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
+// LLVM-LABEL: @test_vfma_f16(
+// CIR-LABEL: @vfma_f16(
+float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[B_CAST:%.*]], <4 x half> [[C_CAST:%.*]], <4 x half> [[A_CAST:%.*]])
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_f16(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f16(
 // CIR-LABEL: @vfmaq_f16(
 float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 4e30b063064aa..06be15486463e 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -29,6 +29,28 @@
 // 2.1.1.2.5 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
+// LLVM-LABEL: @test_vfma_f32(
+// CIR-LABEL: @vfma_f32(
+float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[B_CAST:%.*]], <2 x float> [[C_CAST:%.*]], <2 x float> [[A_CAST:%.*]])
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_f32(a, b, c);
+}
+
+// LLVM-LABEL: @test_vfma_f64(
+// CIR-LABEL: @vfma_f64(
+float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
[[B_CAST:%.*]], <1 x double> [[C_CAST:%.*]], <1 x double> [[A_CAST:%.*]])
+// LLVM:      ret <1 x double> [[FMA]]
+  return vfma_f64(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f32(
 // CIR-LABEL: @vfmaq_f32(
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {

>From 526cd1e6e7b54f2e022f25dc4a03add9719d89e9 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 14:40:53 +0300
Subject: [PATCH 2/7] [CIR][AArch64] Lower vfma_lane_v builtin

Lower BI__builtin_neon_vfma_lane_v through the existing vfmaq_lane_v path.

Add f16, f32, and f64 coverage for the non-quad vfma_lane_* ACLE wrappers.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 15 +++--
 .../AArch64/neon/fused-multiple-fullfp16.c    | 57 ++++++++++++-------
 .../CodeGen/AArch64/neon/fused-multiply.c     | 24 ++++++++
 3 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 3900e02472e75..a692642ee7306 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2678,17 +2678,16 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return builder.createBitcast(ops[0], ty);
   }
   case NEON::BI__builtin_neon_vfma_lane_v:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AArch64 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return mlir::Value{};
   case NEON::BI__builtin_neon_vfmaq_lane_v: {
     mlir::Value addend = builder.createBitcast(ops[0], ty);
     mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
-    // The lane source operand is the non-quad vector, so it has half as many
-    // lanes as the quad result vector.
-    cir::VectorType sourceTy =
-        cir::VectorType::get(ty.getElementType(), ty.getSize() / 2);
+    // For vfmaq_lane, the lane source operand is the non-quad vector, so it has
+    // half as many lanes as the quad result vector. For vfma_lane, it has the
+    // same shape as the result vector.
+    cir::VectorType sourceTy = cir::VectorType::get(
+        ty.getElementType(), builtinID == NEON::BI__builtin_neon_vfmaq_lane_v
+                                 ? ty.getSize() / 2
+                                 : ty.getSize());
     mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
     laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize());
 
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 25c7214d68ab7..30a0eb148e1b1 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -58,6 +58,19 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, 
float16x8_t c) {
   return vfmaq_f16(a, b, c);
 }
 
+// ALL-LABEL: @test_vfma_lane_f16(
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b,
+                                float16x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.f16>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <4 x half> {{.*}}, <4 x half> 
{{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[B_CAST:%.*]], <4 x half> [[LANE]], <4 x half> [[A_CAST:%.*]])
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_lane_f16(a, b, c, 3);
+}
+
 // ALL-LABEL: @test_vfmaq_lane_f16(
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
                                  float16x4_t c) {
@@ -78,28 +91,6 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
   return vfmaq_lane_f16(a, b, c, 3);
 }
 
-// ALL-LABEL: @test_vfmaq_laneq_f16(
-float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
-                                  float16x8_t c) {
-// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
-// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
-
-// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
-// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
-// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
-// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
-// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
-// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
-// LLVM:      ret <8 x half> [[FMA]]
-  return vfmaq_laneq_f16(a, b, c, 7);
-}
-
 // ALL-LABEL: @test_vfma_laneq_f16(
 float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b,
                                  float16x8_t c) {
@@ -121,3 +112,25 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t 
b,
 // LLVM:      ret <4 x half> [[FMA]]
   return vfma_laneq_f16(a, b, c, 7);
 }
+
+// ALL-LABEL: @test_vfmaq_laneq_f16(
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
+                                  float16x8_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
+
+// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
+// LLVM:      ret <8 x half> [[FMA]]
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 06be15486463e..8347038cea0db 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -91,6 +91,30 @@ float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, 
float64x2_t c) {
   return vfmaq_f64(a, b, c);
 }
 
+// ALL-LABEL: @test_vfma_lane_f32(
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x 
!cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <2 x float> {{.*}}, <2 x float> 
{{.*}}, <2 x i32> <i32 1, i32 1>
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[B_CAST:%.*]], <2 x float> [[LANE]], <2 x float> [[A_CAST:%.*]])
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// ALL-LABEL: @test_vfma_lane_f64(
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<1 x 
!cir.double>) [#cir.int<0> : !s32i] : !cir.vector<1 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <1 x double> {{.*}}, <1 x double> 
{{.*}}, <1 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
[[B_CAST:%.*]], <1 x double> [[LANE]], <1 x double> [[A_CAST:%.*]])
+// LLVM:      ret <1 x double> [[FMA]]
+  return vfma_lane_f64(a, b, v, 0);
+}
+
 // ALL-LABEL: @test_vfmaq_lane_f32(
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, 
#cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>

>From cd4e4e6bb67c56b51a539ab6124fd781554aa242 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 15:08:57 +0300
Subject: [PATCH 3/7] [CIR][AArch64] Lower vfmas lane builtins

Lower BI__builtin_neon_vfmas_lane_f32 and
BI__builtin_neon_vfmas_laneq_f32 by extracting the selected lane and
emitting llvm.fma.

Add scalar f32 lane and laneq coverage.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 28 +++++++++++++++++--
 .../CodeGen/AArch64/neon/fused-multiply.c     | 24 ++++++++++++++++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index a692642ee7306..5be3d4f98c9f2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2737,9 +2737,33 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
   case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmas_lane_f32: {
+    // Scalar lane/laneq forms use one selected element from the lane source.
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(
+        builder, loc, "fma", convertType(expr->getType()), fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmas_laneq_f32: {
+    // Scalar lane/laneq forms use one selected element from the lane source.
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(
+        builder, loc, "fma", convertType(expr->getType()), fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmad_lane_f64:
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 8347038cea0db..8938958c05f38 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -310,6 +310,30 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, 
float64x2_t b,
   return vfmaq_laneq_f64(a, b, v, 0);
 }
 
+// ALL-LABEL: @test_vfmas_lane_f32(
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.float, !cir.float, !cir.float) -> !cir.float
+
+// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <2 x float> 
{{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <2 x float> [[C]], i{{32|64}} 1
+// LLVM:      [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[LANE]], float [[A]])
+// LLVM:      ret float [[FMA]]
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// ALL-LABEL: @test_vfmas_laneq_f32(
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.float, !cir.float, !cir.float) -> !cir.float
+
+// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <4 x float> 
{{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <4 x float> [[C]], i{{32|64}} 3
+// LLVM:      [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[LANE]], float [[A]])
+// LLVM:      ret float [[FMA]]
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
 // ALL-LABEL: @test_vfmad_laneq_f64(
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
 // CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.double>

>From e2df24008b01866204aee437bdc222ef752c68a7 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 23 Jun 2026 11:44:18 +0300
Subject: [PATCH 4/7] [CIR][AArch64] Remove moved FMA tests

Remove legacy normal CodeGen checks for NEON FMA wrappers that are now
covered by the CIR-enabled fused-multiply tests.

Leave constrained-FP coverage and unrelated lane variants in place.
---
 clang/test/CodeGen/AArch64/neon-2velem.c      | 29 -------------
 clang/test/CodeGen/AArch64/neon-intrinsics.c  | 42 -------------------
 .../AArch64/neon-scalar-x-indexed-elem.c      | 34 ---------------
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 39 -----------------
 4 files changed, 144 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c b/clang/test/CodeGen/AArch64/neon-2velem.c
index c7eca2d8426c6..273d8ce3cb316 100644
--- a/clang/test/CodeGen/AArch64/neon-2velem.c
+++ b/clang/test/CodeGen/AArch64/neon-2velem.c
@@ -405,25 +405,6 @@ uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t 
v) {
   return vmulq_laneq_u32(a, v, 3);
 }
 
-// CHECK-LABEL: @test_vfma_lane_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x 
float> [[TMP6]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
-//
-float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  return vfma_lane_f32(a, b, v, 1);
-}
-
 // CHECK-LABEL: @test_vfms_lane_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -545,16 +526,6 @@ float64x2_t test_vfmsq_laneq_f64(float64x2_t a, 
float64x2_t b, float64x2_t v) {
   return vfmsq_laneq_f64(a, b, v, 1);
 }
 
-// CHECK-LABEL: @test_vfmas_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], 
float [[EXTRACT]], float [[A:%.*]])
-// CHECK-NEXT:    ret float [[TMP0]]
-//
-float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  return vfmas_laneq_f32(a, b, v, 3);
-}
-
 // CHECK-LABEL: @test_vfmsd_lane_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index b37ed5aa29f10..37c9bc9ea8dee 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -487,25 +487,6 @@ float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, 
float64x2_t v3) {
   return vmlsq_f64(v1, v2, v3);
 }
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vfma_f32(
-// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], 
<2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]])
-// CHECK-NEXT:    ret <2 x float> [[TMP9]]
-//
-float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  return vfma_f32(v1, v2, v3);
-}
-
 // CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32(
 // CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], 
<2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -17393,28 +17374,6 @@ float64x1_t test_vmls_f64(float64x1_t a, float64x1_t 
b, float64x1_t c) {
   return vmls_f64(a, b, c);
 }
 
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[C]] to i64
-// CHECK-NEXT:    [[__P2_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> 
[[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> 
[[__P2_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x 
double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]])
-// CHECK-NEXT:    ret <1 x double> [[TMP9]]
-//
-float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  return vfma_f64(a, b, c);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -17788,4 +17747,3 @@ float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t 
b) {
 float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
   return vrsqrts_f64(a, b);
 }
-
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index fdb772a79e973..59be5ac738278 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -148,17 +148,6 @@ float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, 
float64x2_t b) {
 }
 
 
-// CHECK-LABEL: define dso_local float @test_vfmas_lane_f32(
-// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> 
noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[EXTRACT]], float [[A]])
-// CHECK-NEXT:    ret float [[TMP0]]
-//
-float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
-  return vfmas_lane_f32(a, b, c, 1);
-}
-
 // CHECK-LABEL: define dso_local double @test_vfmad_lane_f64(
 // CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x 
double> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -182,29 +171,6 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, 
float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> 
[[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> 
[[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> 
[[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x 
double> [[TMP6]], <1 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x 
double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK-NEXT:    ret <1 x double> [[FMLA2]]
-//
-float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
-  return vfma_lane_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index 669c426b42687..a2d6c9d0bff89 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1550,25 +1550,6 @@ float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) 
{
   return vsubq_f16(a, b);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]])
-// CHECK-NEXT:    ret <4 x half> [[TMP9]]
-//
-float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
-  return vfma_f16(a, b, c);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfms_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
@@ -1609,26 +1590,6 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, 
float16x8_t c) {
   return vfmsq_f16(a, b, c);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_lane_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> 
[[TMP6]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x half> [[FMLA2]]
-//
-float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
-  return vfma_lane_f16(a, b, c, 3);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], 
half noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:

>From 5e08f7e7ce7d724f3ce5b5fdf8bbcc34ece81162 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Thu, 25 Jun 2026 06:46:44 +0300
Subject: [PATCH 5/7] [CIR][AArch64] Lower fp16 scalar FMA lanes

Share scalar f16 and f32 FMA lane lowering so vfmah_lane_f16 and
vfmah_laneq_f16 use the same CIR path as the f32 variants.

Represent Poly128 as a 16-byte integer vector in CIR, matching classic
CodeGen, so the f16 laneq path no longer reports an unrelated NYI.

Tighten the moved Neon FMA LLVM checks so FileCheck captures are
introduced where values are produced instead of inside fma calls.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 31 ++--------
 .../AArch64/neon/fused-multiple-fullfp16.c    | 54 +++++++++++++++--
 .../CodeGen/AArch64/neon/fused-multiply.c     | 58 +++++++++++++++----
 3 files changed, 102 insertions(+), 41 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 5be3d4f98c9f2..8c291abcd9681 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -492,9 +492,7 @@ emitAArch64CompareBuiltinExpr(CIRGenFunction &cgf, CIRGenBuilderTy &builder,
   return builder.createCast(loc, cir::CastKind::integral, cmp, retTy);
 }
 
-// TODO(cir): Remove `loc` from the list of arguments once all NYIs are gone.
 static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
-                                   mlir::Location loc,
                                    bool hasLegalHalfType = true,
                                    bool v1Ty = false,
                                    bool allowBFloatArgsAndRet = true) {
@@ -535,8 +533,7 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
     // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
     // There is a lot of i128 and f128 API missing.
     // so we use v16i8 to represent poly128 and get pattern matched.
-    cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Poly128"));
-    [[fallthrough]];
+    return cir::VectorType::get(cgf->uInt8Ty, 16);
   case NeonTypeFlags::Float32:
     return cir::VectorType::get(cgf->getCIRGenModule().floatTy,
                                 v1Ty ? 1 : (2 << isQuad));
@@ -648,8 +645,8 @@ static mlir::Value emitCommonNeonBuiltinExpr(
   // FIXME
   // getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
 
-  cir::VectorType vTy = getNeonType(&cgf, neonType, loc, hasLegalHalfType,
-                                    false, allowBFloatArgsAndRet);
+  cir::VectorType vTy = getNeonType(&cgf, neonType, hasLegalHalfType, false,
+                                    allowBFloatArgsAndRet);
   cir::VectorType ty = vTy;
   if (!ty)
     return nullptr;
@@ -2655,7 +2652,7 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
-  cir::VectorType ty = getNeonType(this, type, loc);
+  cir::VectorType ty = getNeonType(this, type);
   if (!ty)
     return nullptr;
 
@@ -2737,24 +2734,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
   case NEON::BI__builtin_neon_vfmah_lane_f16:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AArch64 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return mlir::Value{};
-  case NEON::BI__builtin_neon_vfmas_lane_f32: {
-    // Scalar lane/laneq forms use one selected element from the lane source.
-    mlir::Value laneSource = builder.createExtractElement(
-        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
-
-    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
-    return emitCallMaybeConstrainedBuiltin(
-        builder, loc, "fma", convertType(expr->getType()), fmaOps);
-  }
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
   case NEON::BI__builtin_neon_vfmah_laneq_f16:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AArch64 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return mlir::Value{};
   case NEON::BI__builtin_neon_vfmas_laneq_f32: {
     // Scalar lane/laneq forms use one selected element from the lane source.
     mlir::Value laneSource = builder.createExtractElement(
@@ -2927,7 +2908,7 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vcvtq_f64_v:
     ops[0] = builder.createBitcast(ops[0], ty);
     ty = getNeonType(
-        this, NeonTypeFlags(NeonTypeFlags::Float64, false, type.isQuad()), loc);
+        this, NeonTypeFlags(NeonTypeFlags::Float64, false, type.isQuad()));
     return builder.createCast(loc, cir::CastKind::int_to_float, ops[0], ty);
   case NEON::BI__builtin_neon_vcvt_f64_f32:
   case NEON::BI__builtin_neon_vcvt_f32_f64:
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 30a0eb148e1b1..65a6f5a31f6a8 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -14,7 +14,9 @@
 // The main difference is the use of RUN lines that enable ClangIR lowering.
 // This file currently covers the f16 wrappers that lower through
 // BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v,
-// BI__builtin_neon_vfmaq_laneq_v, and BI__builtin_neon_vfma_laneq_v.
+// BI__builtin_neon_vfmaq_laneq_v, BI__builtin_neon_vfma_laneq_v,
+// BI__builtin_neon_vfmah_lane_f16, and
+// BI__builtin_neon_vfmah_laneq_f16.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate-2
@@ -33,8 +35,17 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, 
float16x4_t c) {
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
 
 // LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[B_CAST:%.*]], <4 x half> [[C_CAST:%.*]], <4 x half> [[A_CAST:%.*]])
-// LLVM:      ret <4 x half> [[FMA]]
+// LLVM:      [[A_I:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i16> [[A_I]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i16> [[B_I]] to <8 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i16> [[C_I]] to <8 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <8 x i8> [[A_BYTES]] to <4 x half>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <8 x i8> [[B_BYTES]] to <4 x half>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <8 x i8> [[C_BYTES]] to <4 x half>
+// LLVM-NEXT: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[B_CAST]], <4 x half> [[C_CAST]], <4 x half> [[A_CAST]])
+// LLVM-NEXT: ret <4 x half> [[FMA]]
   return vfma_f16(a, b, c);
 }
 
@@ -65,8 +76,15 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b,
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
 
 // LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[LANE:%.*]] = shufflevector <4 x half> {{.*}}, <4 x half> 
{{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[B_CAST:%.*]], <4 x half> [[LANE]], <4 x half> [[A_CAST:%.*]])
+// LLVM:      [[A_I:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i16> [[A_I]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i16> [[B_I]] to <8 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i16> [[C_I]] to <8 x i8>
+// LLVM:      [[C_CAST:%.*]] = bitcast <8 x i8> [[C_BYTES]] to <4 x half>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C_CAST]], <4 x half> 
{{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> {{.*}}, 
<4 x half> [[LANE]], <4 x half> {{.*}})
 // LLVM:      ret <4 x half> [[FMA]]
   return vfma_lane_f16(a, b, c, 3);
 }
@@ -86,7 +104,7 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
 // LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i16> [[C_I]] to <8 x i8>
 // LLVM:      [[C_CAST:%.*]] = bitcast <8 x i8> [[C_BYTES]] to <4 x half>
 // LLVM-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C_CAST]], <4 x half> 
{{.*}}, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// LLVM:      [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[B_CAST:%.*]], <8 x half> [[LANE]], <8 x half> [[A_CAST:%.*]])
+// LLVM:      [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> {{.*}}, 
<8 x half> [[LANE]], <8 x half> {{.*}})
 // LLVM:      ret <8 x half> [[FMA]]
   return vfmaq_lane_f16(a, b, c, 3);
 }
@@ -134,3 +152,27 @@ float16x8_t test_vfmaq_laneq_f16(float16x8_t a, 
float16x8_t b,
 // LLVM:      ret <8 x half> [[FMA]]
   return vfmaq_laneq_f16(a, b, c, 7);
 }
+
+// ALL-LABEL: @test_vfmah_lane_f16(
+float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<4 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.f16, !cir.f16, !cir.f16) -> !cir.f16
+
+// LLVM-SAME: half {{.*}} [[A:%.*]], half {{.*}} [[B:%.*]], <4 x half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <4 x half> [[C]], i{{32|64}} 3
+// LLVM:      [[FMA:%.*]] = call half @llvm.fma.f16(half [[B]], half [[LANE]], half [[A]])
+// LLVM:      ret half [[FMA]]
+  return vfmah_lane_f16(a, b, c, 3);
+}
+
+// ALL-LABEL: @test_vfmah_laneq_f16(
+float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<8 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.f16, !cir.f16, !cir.f16) -> !cir.f16
+
+// LLVM-SAME: half {{.*}} [[A:%.*]], half {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <8 x half> [[C]], i{{32|64}} 7
+// LLVM:      [[FMA:%.*]] = call half @llvm.fma.f16(half [[B]], half [[LANE]], half [[A]])
+// LLVM:      ret half [[FMA]]
+  return vfmah_laneq_f16(a, b, c, 7);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 8938958c05f38..ba1d79ce2c816 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -35,8 +35,17 @@ float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, 
float32x2_t c) {
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
 
 // LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[B_CAST:%.*]], <2 x float> [[C_CAST:%.*]], <2 x float> [[A_CAST:%.*]])
-// LLVM:      ret <2 x float> [[FMA]]
+// LLVM:      [[A_I:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <2 x float> [[C]] to <2 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i32> [[A_I]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i32> [[B_I]] to <8 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <2 x i32> [[C_I]] to <8 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <8 x i8> [[A_BYTES]] to <2 x float>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <8 x i8> [[B_BYTES]] to <2 x float>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <8 x i8> [[C_BYTES]] to <2 x float>
+// LLVM-NEXT: [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[B_CAST]], <2 x float> [[C_CAST]], <2 x float> [[A_CAST]])
+// LLVM-NEXT: ret <2 x float> [[FMA]]
   return vfma_f32(a, b, c);
 }
 
@@ -46,8 +55,20 @@ float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, 
float64x1_t c) {
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
 
 // LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
[[B_CAST:%.*]], <1 x double> [[C_CAST:%.*]], <1 x double> [[A_CAST:%.*]])
-// LLVM:      ret <1 x double> [[FMA]]
+// LLVM:      [[A_I:%.*]] = bitcast <1 x double> [[A]] to i64
+// LLVM-NEXT: [[A_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[A_I]], 
i32 0
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <1 x double> [[B]] to i64
+// LLVM-NEXT: [[B_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[B_I]], 
i32 0
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <1 x double> [[C]] to i64
+// LLVM-NEXT: [[C_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[C_I]], 
i32 0
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <1 x i64> [[A_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <1 x i64> [[B_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <1 x i64> [[C_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <8 x i8> [[A_BYTES]] to <1 x double>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <8 x i8> [[B_BYTES]] to <1 x double>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <8 x i8> [[C_BYTES]] to <1 x double>
+// LLVM-NEXT: [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
[[B_CAST]], <1 x double> [[C_CAST]], <1 x double> [[A_CAST]])
+// LLVM-NEXT: ret <1 x double> [[FMA]]
   return vfma_f64(a, b, c);
 }
 
@@ -97,8 +118,15 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t 
b, float32x2_t v) {
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
 
 // LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
-// LLVM:      [[LANE:%.*]] = shufflevector <2 x float> {{.*}}, <2 x float> 
{{.*}}, <2 x i32> <i32 1, i32 1>
-// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[B_CAST:%.*]], <2 x float> [[LANE]], <2 x float> [[A_CAST:%.*]])
+// LLVM:      [[A_I:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <2 x float> [[V]] to <2 x i32>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i32> [[A_I]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i32> [[B_I]] to <8 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i32> [[V_I]] to <8 x i8>
+// LLVM:      [[V_CAST:%.*]] = bitcast <8 x i8> [[V_BYTES]] to <2 x float>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V_CAST]], <2 x float> 
{{.*}}, <2 x i32> <i32 1, i32 1>
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
{{.*}}, <2 x float> [[LANE]], <2 x float> {{.*}})
 // LLVM:      ret <2 x float> [[FMA]]
   return vfma_lane_f32(a, b, v, 1);
 }
@@ -109,8 +137,18 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t 
b, float64x1_t v) {
 // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
 
 // LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[V:%.*]]) {{.*}} {
-// LLVM:      [[LANE:%.*]] = shufflevector <1 x double> {{.*}}, <1 x double> 
{{.*}}, <1 x i32> zeroinitializer
-// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
[[B_CAST:%.*]], <1 x double> [[LANE]], <1 x double> [[A_CAST:%.*]])
+// LLVM:      [[A_I:%.*]] = bitcast <1 x double> [[A]] to i64
+// LLVM-NEXT: [[A_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[A_I]], 
i32 0
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <1 x double> [[B]] to i64
+// LLVM-NEXT: [[B_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[B_I]], 
i32 0
+// LLVM-NEXT: [[V_I:%.*]] = bitcast <1 x double> [[V]] to i64
+// LLVM-NEXT: [[V_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[V_I]], 
i32 0
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <1 x i64> [[A_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <1 x i64> [[B_INSERT]] to <8 x i8>
+// LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <1 x i64> [[V_INSERT]] to <8 x i8>
+// LLVM:      [[V_CAST:%.*]] = bitcast <8 x i8> [[V_BYTES]] to <1 x double>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V_CAST]], <1 x 
double> {{.*}}, <1 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
{{.*}}, <1 x double> [[LANE]], <1 x double> {{.*}})
 // LLVM:      ret <1 x double> [[FMA]]
   return vfma_lane_f64(a, b, v, 0);
 }
@@ -129,7 +167,7 @@ float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t 
b, float32x2_t v) {
 // LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <2 x i32> [[V_I]] to <8 x i8>
 // LLVM:      [[V_CAST:%.*]] = bitcast <8 x i8> [[V_BYTES]] to <2 x float>
 // LLVM-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V_CAST]], <2 x float> 
{{.*}}, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// LLVM:      [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[B_CAST:%.*]], <4 x float> [[LANE]], <4 x float> [[A_CAST:%.*]])
+// LLVM:      [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
{{.*}}, <4 x float> [[LANE]], <4 x float> {{.*}})
 // LLVM:      ret <4 x float> [[FMA]]
   return vfmaq_lane_f32(a, b, v, 1);
 }
@@ -149,7 +187,7 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t 
b, float64x1_t v) {
 // LLVM-NEXT: [[V_BYTES:%.*]] = bitcast <1 x i64> [[V_INSERT]] to <8 x i8>
 // LLVM:      [[V_CAST:%.*]] = bitcast <8 x i8> [[V_BYTES]] to <1 x double>
 // LLVM-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V_CAST]], <1 x 
double> {{.*}}, <2 x i32> zeroinitializer
-// LLVM:      [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> 
[[B_CAST:%.*]], <2 x double> [[LANE]], <2 x double> [[A_CAST:%.*]])
+// LLVM:      [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> 
{{.*}}, <2 x double> [[LANE]], <2 x double> {{.*}})
 // LLVM:      ret <2 x double> [[FMA]]
   return vfmaq_lane_f64(a, b, v, 0);
 }

>From 2ca583996520372d988f6d98a0a1f2350afaa87c Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Thu, 25 Jun 2026 08:10:29 +0300
Subject: [PATCH 6/7] [CIR][AArch64] Remove duplicate vfmah tests

Drop the non-constrained vfmah_lane_f16 and vfmah_laneq_f16 checks
from v8.2a-neon-intrinsics.c now that the Neon fused-multiple-fullfp16
test covers them with direct LLVM, CIR-to-LLVM, and CIR checks.

Keep the constrained variants in v8.2a-neon-intrinsics-constrained.c.
---
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 22 -------------------
 1 file changed, 22 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index a2d6c9d0bff89..a58ade4c4c29f 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1640,28 +1640,6 @@ float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t 
b, float16_t c) {
   return vfmaq_n_f16(a, b, c);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfmah_lane_f16
-// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> 
noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half 
[[EXTRACT]], half [[A]])
-// CHECK-NEXT:    ret half [[TMP0]]
-//
-float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
-  return vfmah_lane_f16(a, b, c, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmah_laneq_f16
-// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> 
noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
-// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half 
[[EXTRACT]], half [[A]])
-// CHECK-NEXT:    ret half [[TMP0]]
-//
-float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
-  return vfmah_laneq_f16(a, b, c, 7);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfms_lane_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:

>From c7309867c883e59c34af41389748f0bfe73c73d0 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Fri, 26 Jun 2026 07:36:19 +0300
Subject: [PATCH 7/7] [CIR][AArch64] Lower vfmad_lane_f64

Share the scalar double laneq FMA lowering with vfmad_lane_f64.

Add CIR coverage in fused-multiply.c.

Remove the moved non-constrained legacy test.

Constrained coverage remains in its existing file.
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp    |  8 ++------
 .../CodeGen/AArch64/neon-scalar-x-indexed-elem.c  | 11 -----------
 clang/test/CodeGen/AArch64/neon/fused-multiply.c  | 15 ++++++++++++++-
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 8c291abcd9681..cf3ae484ef92c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2746,13 +2746,9 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
         builder, loc, "fma", convertType(expr->getType()), fmaOps);
   }
   case NEON::BI__builtin_neon_vfmad_lane_f64:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AArch64 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return mlir::Value{};
   case NEON::BI__builtin_neon_vfmad_laneq_f64: {
-    // The laneq source operand is float64x2_t, so the source vector has two
-    // double lanes.
+    // The lane source operand is float64x1_t for lane forms and float64x2_t
+    // for laneq forms.
     mlir::Value laneSource = builder.createExtractElement(
         loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
 
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 59be5ac738278..f701fbf4132e8 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -148,17 +148,6 @@ float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, 
float64x2_t b) {
 }
 
 
-// CHECK-LABEL: define dso_local double @test_vfmad_lane_f64(
-// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x 
double> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i32 0
-// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], 
double [[EXTRACT]], double [[A]])
-// CHECK-NEXT:    ret double [[TMP0]]
-//
-float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
-  return vfmad_lane_f64(a, b, c, 0);
-}
-
 // CHECK-LABEL: define dso_local float @test_vfmss_lane_f32(
 // CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> 
noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index ba1d79ce2c816..b3fbc2bfbc721 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -16,7 +16,8 @@
 // This file currently covers the f32/f64 wrappers that lower through
 // BI__builtin_neon_vfmaq_v, BI__builtin_neon_vfmaq_lane_v,
 // BI__builtin_neon_vfma_laneq_v, BI__builtin_neon_vfmaq_laneq_v,
-// and BI__builtin_neon_vfmad_laneq_f64.
+// BI__builtin_neon_vfmad_lane_f64, and
+// BI__builtin_neon_vfmad_laneq_f64.
 //
 // ACLE section headings based on v2025Q2 of the ACLE specification:
 //  * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate
@@ -372,6 +373,18 @@ float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, 
float32x4_t c) {
   return vfmas_laneq_f32(a, b, c, 3);
 }
 
+// ALL-LABEL: @test_vfmad_lane_f64(
+float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<1 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.double, !cir.double, !cir.double) -> !cir.double
+
+// LLVM-SAME: double {{.*}} [[A:%.*]], double {{.*}} [[B:%.*]], <1 x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <1 x double> [[C]], i{{32|64}} 0
+// LLVM:      [[FMA:%.*]] = call double @llvm.fma.f64(double [[B]], double [[LANE]], double [[A]])
+// LLVM:      ret double [[FMA]]
+  return vfmad_lane_f64(a, b, c, 0);
+}
+
 // ALL-LABEL: @test_vfmad_laneq_f64(
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
 // CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.double>

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR][AArch64] Lower vfma and scalar FMA lane builtins (PR #204819)

Reply via email to