https://github.com/yairbenavraham created 
https://github.com/llvm/llvm-project/pull/204819

Lower additional AArch64 NEON fused multiply-accumulate builtins in CIR.

This covers:
  - BI__builtin_neon_vfma_v: vfma_f16, vfma_f32, vfma_f64
  - BI__builtin_neon_vfma_lane_v: vfma_lane_f16, vfma_lane_f32, vfma_lane_f64
  - BI__builtin_neon_vfmas_lane_f32: vfmas_lane_f32
  - BI__builtin_neon_vfmas_laneq_f32: vfmas_laneq_f32

vfma_v and vfma_lane_v reuse the corresponding quad FMA lowering paths. The 
lane forms splat the selected vector element, while the scalar forms extract 
the selected lane before emitting llvm.fma.

Add direct LLVM, CIR-to-LLVM, and CIR coverage for every listed ACLE wrapper in 
the existing fused-multiply tests.

Part of #185382
Follow-up to #202337

>From 99e5bf56f0d2df026c9fd0245d7dede0c90c1fec Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 14:25:51 +0300
Subject: [PATCH 1/3] [CIR][AArch64] Lower vfma_v builtin

Lower BI__builtin_neon_vfma_v through the existing vfmaq_v FMA path.

Add f16, f32, and f64 coverage for the non-quad vfma_* ACLE wrappers.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  6 ++---
 .../AArch64/neon/fused-multiple-fullfp16.c    | 11 ++++++++++
 .../CodeGen/AArch64/neon/fused-multiply.c     | 22 +++++++++++++++++++
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index b52e978fa0f4d..e57abe24a6c3f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -837,15 +837,15 @@ static mlir::Value emitCommonNeonBuiltinExpr(
   case NEON::BI__builtin_neon_vcvtx_f32_v:
   case NEON::BI__builtin_neon_vext_v:
   case NEON::BI__builtin_neon_vextq_v:
-  case NEON::BI__builtin_neon_vfma_v:
     cgf.cgm.errorNYI(expr->getSourceRange(),
                      std::string("unimplemented AArch64 builtin call: ") +
                          ctx.BuiltinInfo.getName(builtinID));
     return mlir::Value{};
+  case NEON::BI__builtin_neon_vfma_v:
   case NEON::BI__builtin_neon_vfmaq_v: {
-    // NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2)
+    // NEON intrinsic: vfma(q)(accumulator, multiplicand1, multiplicand2)
     // LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator)
-    // Reorder arguments to match LLVM fma signature
+    // Reorder arguments to match LLVM fma signature.
     mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty);
     mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty);
     mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 1460fb3b2bae1..f9ddb27dc6be1 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -27,6 +27,17 @@
 // 2.6.1.9.3 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
+// LLVM-LABEL: @test_vfma_f16(
+// CIR-LABEL: @vfma_f16(
+float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> {{.*}}, 
<4 x half> {{.*}}, <4 x half> {{.*}})
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_f16(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f16(
 // CIR-LABEL: @vfmaq_f16(
 float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 4e30b063064aa..4b74a8b7605c1 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -29,6 +29,28 @@
 // 2.1.1.2.5 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
+// LLVM-LABEL: @test_vfma_f32(
+// CIR-LABEL: @vfma_f32(
+float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
{{.*}}, <2 x float> {{.*}}, <2 x float> {{.*}})
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_f32(a, b, c);
+}
+
+// LLVM-LABEL: @test_vfma_f64(
+// CIR-LABEL: @vfma_f64(
+float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
{{.*}}, <1 x double> {{.*}}, <1 x double> {{.*}})
+// LLVM:      ret <1 x double> [[FMA]]
+  return vfma_f64(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f32(
 // CIR-LABEL: @vfmaq_f32(
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {

>From 7dbe4c528638c788fa96f5ff12f78c9da047d0a1 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 14:40:53 +0300
Subject: [PATCH 2/3] [CIR][AArch64] Lower vfma_lane_v builtin

Lower BI__builtin_neon_vfma_lane_v through the existing vfmaq_lane_v path.

Add f16, f32, and f64 coverage for the non-quad vfma_lane_* ACLE wrappers.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 15 +++--
 .../AArch64/neon/fused-multiple-fullfp16.c    | 57 ++++++++++++-------
 .../CodeGen/AArch64/neon/fused-multiply.c     | 24 ++++++++
 3 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index e57abe24a6c3f..b0da33d271f6e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2744,17 +2744,16 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return builder.createBitcast(ops[0], ty);
   }
   case NEON::BI__builtin_neon_vfma_lane_v:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AArch64 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return mlir::Value{};
   case NEON::BI__builtin_neon_vfmaq_lane_v: {
     mlir::Value addend = builder.createBitcast(ops[0], ty);
     mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
-    // The lane source operand is the non-quad vector, so it has half as many
-    // lanes as the quad result vector.
-    cir::VectorType sourceTy =
-        cir::VectorType::get(ty.getElementType(), ty.getSize() / 2);
+    // For vfmaq_lane, the lane source operand is the non-quad vector, so it 
has
+    // half as many lanes as the quad result vector. For vfma_lane, it has the
+    // same shape as the result vector.
+    cir::VectorType sourceTy = cir::VectorType::get(
+        ty.getElementType(), builtinID == NEON::BI__builtin_neon_vfmaq_lane_v
+                                 ? ty.getSize() / 2
+                                 : ty.getSize());
     mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
     laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize());
 
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index f9ddb27dc6be1..cd929ac88a1a3 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -58,6 +58,19 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, 
float16x8_t c) {
   return vfmaq_f16(a, b, c);
 }
 
+// ALL-LABEL: @test_vfma_lane_f16(
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b,
+                                float16x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.f16>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <4 x half> {{.*}}, <4 x half> 
{{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> {{.*}}, 
<4 x half> [[LANE]], <4 x half> {{.*}})
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_lane_f16(a, b, c, 3);
+}
+
 // ALL-LABEL: @test_vfmaq_lane_f16(
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
                                  float16x4_t c) {
@@ -78,28 +91,6 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
   return vfmaq_lane_f16(a, b, c, 3);
 }
 
-// ALL-LABEL: @test_vfmaq_laneq_f16(
-float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
-                                  float16x8_t c) {
-// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
-// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
-
-// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
-// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
-// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
-// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
-// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
-// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
-// LLVM:      ret <8 x half> [[FMA]]
-  return vfmaq_laneq_f16(a, b, c, 7);
-}
-
 // ALL-LABEL: @test_vfma_laneq_f16(
 float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b,
                                  float16x8_t c) {
@@ -121,3 +112,25 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t 
b,
 // LLVM:      ret <4 x half> [[FMA]]
   return vfma_laneq_f16(a, b, c, 7);
 }
+
+// ALL-LABEL: @test_vfmaq_laneq_f16(
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
+                                  float16x8_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
+
+// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
+// LLVM:      ret <8 x half> [[FMA]]
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 4b74a8b7605c1..5f6d496316799 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -91,6 +91,30 @@ float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, 
float64x2_t c) {
   return vfmaq_f64(a, b, c);
 }
 
+// ALL-LABEL: @test_vfma_lane_f32(
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x 
!cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <2 x float> {{.*}}, <2 x float> 
{{.*}}, <2 x i32> <i32 1, i32 1>
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
{{.*}}, <2 x float> [[LANE]], <2 x float> {{.*}})
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// ALL-LABEL: @test_vfma_lane_f64(
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<1 x 
!cir.double>) [#cir.int<0> : !s32i] : !cir.vector<1 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <1 x double> {{.*}}, <1 x double> 
{{.*}}, <1 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
{{.*}}, <1 x double> [[LANE]], <1 x double> {{.*}})
+// LLVM:      ret <1 x double> [[FMA]]
+  return vfma_lane_f64(a, b, v, 0);
+}
+
 // ALL-LABEL: @test_vfmaq_lane_f32(
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, 
#cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>

>From 121b22be7c0a51c8ebc8f2ee321f24f003d12c27 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 15:08:57 +0300
Subject: [PATCH 3/3] [CIR][AArch64] Lower vfmas lane builtins

Lower BI__builtin_neon_vfmas_lane_f32 and
BI__builtin_neon_vfmas_laneq_f32 by extracting the selected lane and
emitting llvm.fma.

Add scalar f32 lane and laneq coverage.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 28 +++++++++++++++++--
 .../CodeGen/AArch64/neon/fused-multiply.c     | 24 ++++++++++++++++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index b0da33d271f6e..8be4d567cefc5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2803,9 +2803,33 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
   case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmas_lane_f32: {
+    // Scalar lane/laneq forms use one selected element from the lane source.
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(
+        builder, loc, "fma", convertType(expr->getType()), fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmas_laneq_f32: {
+    // Scalar lane/laneq forms use one selected element from the lane source.
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(
+        builder, loc, "fma", convertType(expr->getType()), fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmad_lane_f64:
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 5f6d496316799..f90e9642c64fe 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -310,6 +310,30 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, 
float64x2_t b,
   return vfmaq_laneq_f64(a, b, v, 0);
 }
 
+// ALL-LABEL: @test_vfmas_lane_f32(
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.float, !cir.float, !cir.float) -> !cir.float
+
+// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <2 x float> 
{{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <2 x float> [[C]], i{{32|64}} 1
+// LLVM:      [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[LANE]], float [[A]])
+// LLVM:      ret float [[FMA]]
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// ALL-LABEL: @test_vfmas_laneq_f32(
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.float, !cir.float, !cir.float) -> !cir.float
+
+// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <4 x float> 
{{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <4 x float> [[C]], i{{32|64}} 3
+// LLVM:      [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[LANE]], float [[A]])
+// LLVM:      ret float [[FMA]]
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
 // ALL-LABEL: @test_vfmad_laneq_f64(
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
 // CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.double>

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to