[clang] [CIR][AArch64] Lower vfma_v, vfma_lane_v, and vfmas lane builtins (PR #204819)

Yair Ben Avraham via cfe-commits Tue, 23 Jun 2026 09:42:34 -0700

https://github.com/yairbenavraham updated 
https://github.com/llvm/llvm-project/pull/204819


>From 1428e169ec0a399ab0eb122d98983f2450567d39 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 14:25:51 +0300
Subject: [PATCH 1/4] [CIR][AArch64] Lower vfma_v builtin

Lower BI__builtin_neon_vfma_v through the existing vfmaq_v FMA path.

Add f16, f32, and f64 coverage for the non-quad vfma_* ACLE wrappers.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  6 ++---
 .../AArch64/neon/fused-multiple-fullfp16.c    | 11 ++++++++++
 .../CodeGen/AArch64/neon/fused-multiply.c     | 22 +++++++++++++++++++
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 8b077620d2bab..3900e02472e75 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -837,15 +837,15 @@ static mlir::Value emitCommonNeonBuiltinExpr(
   case NEON::BI__builtin_neon_vcvtx_f32_v:
   case NEON::BI__builtin_neon_vext_v:
   case NEON::BI__builtin_neon_vextq_v:
-  case NEON::BI__builtin_neon_vfma_v:
     cgf.cgm.errorNYI(expr->getSourceRange(),
                      std::string("unimplemented AArch64 builtin call: ") +
                          ctx.BuiltinInfo.getName(builtinID));
     return mlir::Value{};
+  case NEON::BI__builtin_neon_vfma_v:
   case NEON::BI__builtin_neon_vfmaq_v: {
-    // NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2)
+    // NEON intrinsic: vfma(q)(accumulator, multiplicand1, multiplicand2)
     // LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator)
-    // Reorder arguments to match LLVM fma signature
+    // Reorder arguments to match LLVM fma signature.
     mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty);
     mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty);
     mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 1460fb3b2bae1..25c7214d68ab7 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -27,6 +27,17 @@
 // 2.6.1.9.3 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
+// LLVM-LABEL: @test_vfma_f16(
+// CIR-LABEL: @vfma_f16(
+float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[B_CAST:%.*]], <4 x half> [[C_CAST:%.*]], <4 x half> [[A_CAST:%.*]])
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_f16(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f16(
 // CIR-LABEL: @vfmaq_f16(
 float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 4e30b063064aa..06be15486463e 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -29,6 +29,28 @@
 // 2.1.1.2.5 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
+// LLVM-LABEL: @test_vfma_f32(
+// CIR-LABEL: @vfma_f32(
+float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[B_CAST:%.*]], <2 x float> [[C_CAST:%.*]], <2 x float> [[A_CAST:%.*]])
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_f32(a, b, c);
+}
+
+// LLVM-LABEL: @test_vfma_f64(
+// CIR-LABEL: @vfma_f64(
+float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
[[B_CAST:%.*]], <1 x double> [[C_CAST:%.*]], <1 x double> [[A_CAST:%.*]])
+// LLVM:      ret <1 x double> [[FMA]]
+  return vfma_f64(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f32(
 // CIR-LABEL: @vfmaq_f32(
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {

>From 526cd1e6e7b54f2e022f25dc4a03add9719d89e9 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 14:40:53 +0300
Subject: [PATCH 2/4] [CIR][AArch64] Lower vfma_lane_v builtin

Lower BI__builtin_neon_vfma_lane_v through the existing vfmaq_lane_v path.

Add f16, f32, and f64 coverage for the non-quad vfma_lane_* ACLE wrappers.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 15 +++--
 .../AArch64/neon/fused-multiple-fullfp16.c    | 57 ++++++++++++-------
 .../CodeGen/AArch64/neon/fused-multiply.c     | 24 ++++++++
 3 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 3900e02472e75..a692642ee7306 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2678,17 +2678,16 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return builder.createBitcast(ops[0], ty);
   }
   case NEON::BI__builtin_neon_vfma_lane_v:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AArch64 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return mlir::Value{};
   case NEON::BI__builtin_neon_vfmaq_lane_v: {
     mlir::Value addend = builder.createBitcast(ops[0], ty);
     mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
-    // The lane source operand is the non-quad vector, so it has half as many
-    // lanes as the quad result vector.
-    cir::VectorType sourceTy =
-        cir::VectorType::get(ty.getElementType(), ty.getSize() / 2);
+    // For vfmaq_lane, the lane source operand is the non-quad vector, so it 
has
+    // half as many lanes as the quad result vector. For vfma_lane, it has the
+    // same shape as the result vector.
+    cir::VectorType sourceTy = cir::VectorType::get(
+        ty.getElementType(), builtinID == NEON::BI__builtin_neon_vfmaq_lane_v
+                                 ? ty.getSize() / 2
+                                 : ty.getSize());
     mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
     laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize());
 
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 25c7214d68ab7..30a0eb148e1b1 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -58,6 +58,19 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, 
float16x8_t c) {
   return vfmaq_f16(a, b, c);
 }
 
+// ALL-LABEL: @test_vfma_lane_f16(
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b,
+                                float16x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.f16>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <4 x half> {{.*}}, <4 x half> 
{{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[B_CAST:%.*]], <4 x half> [[LANE]], <4 x half> [[A_CAST:%.*]])
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_lane_f16(a, b, c, 3);
+}
+
 // ALL-LABEL: @test_vfmaq_lane_f16(
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
                                  float16x4_t c) {
@@ -78,28 +91,6 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
   return vfmaq_lane_f16(a, b, c, 3);
 }
 
-// ALL-LABEL: @test_vfmaq_laneq_f16(
-float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
-                                  float16x8_t c) {
-// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
-// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
-
-// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
-// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
-// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
-// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
-// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
-// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
-// LLVM:      ret <8 x half> [[FMA]]
-  return vfmaq_laneq_f16(a, b, c, 7);
-}
-
 // ALL-LABEL: @test_vfma_laneq_f16(
 float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b,
                                  float16x8_t c) {
@@ -121,3 +112,25 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t 
b,
 // LLVM:      ret <4 x half> [[FMA]]
   return vfma_laneq_f16(a, b, c, 7);
 }
+
+// ALL-LABEL: @test_vfmaq_laneq_f16(
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
+                                  float16x8_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
+
+// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
+// LLVM:      ret <8 x half> [[FMA]]
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 06be15486463e..8347038cea0db 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -91,6 +91,30 @@ float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, 
float64x2_t c) {
   return vfmaq_f64(a, b, c);
 }
 
+// ALL-LABEL: @test_vfma_lane_f32(
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x 
!cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <2 x float> {{.*}}, <2 x float> 
{{.*}}, <2 x i32> <i32 1, i32 1>
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[B_CAST:%.*]], <2 x float> [[LANE]], <2 x float> [[A_CAST:%.*]])
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// ALL-LABEL: @test_vfma_lane_f64(
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<1 x 
!cir.double>) [#cir.int<0> : !s32i] : !cir.vector<1 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <1 x double> {{.*}}, <1 x double> 
{{.*}}, <1 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
[[B_CAST:%.*]], <1 x double> [[LANE]], <1 x double> [[A_CAST:%.*]])
+// LLVM:      ret <1 x double> [[FMA]]
+  return vfma_lane_f64(a, b, v, 0);
+}
+
 // ALL-LABEL: @test_vfmaq_lane_f32(
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, 
#cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>

>From cd4e4e6bb67c56b51a539ab6124fd781554aa242 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 16 Jun 2026 15:08:57 +0300
Subject: [PATCH 3/4] [CIR][AArch64] Lower vfmas lane builtins

Lower BI__builtin_neon_vfmas_lane_f32 and
BI__builtin_neon_vfmas_laneq_f32 by extracting the selected lane and
emitting llvm.fma.

Add scalar f32 lane and laneq coverage.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 28 +++++++++++++++++--
 .../CodeGen/AArch64/neon/fused-multiply.c     | 24 ++++++++++++++++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index a692642ee7306..5be3d4f98c9f2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2737,9 +2737,33 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
   case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmas_lane_f32: {
+    // Scalar lane/laneq forms use one selected element from the lane source.
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(
+        builder, loc, "fma", convertType(expr->getType()), fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmas_laneq_f32: {
+    // Scalar lane/laneq forms use one selected element from the lane source.
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(
+        builder, loc, "fma", convertType(expr->getType()), fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmad_lane_f64:
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 8347038cea0db..8938958c05f38 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -310,6 +310,30 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, 
float64x2_t b,
   return vfmaq_laneq_f64(a, b, v, 0);
 }
 
+// ALL-LABEL: @test_vfmas_lane_f32(
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.float, !cir.float, !cir.float) -> !cir.float
+
+// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <2 x float> 
{{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <2 x float> [[C]], i{{32|64}} 1
+// LLVM:      [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[LANE]], float [[A]])
+// LLVM:      ret float [[FMA]]
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// ALL-LABEL: @test_vfmas_laneq_f32(
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.float, !cir.float, !cir.float) -> !cir.float
+
+// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <4 x float> 
{{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <4 x float> [[C]], i{{32|64}} 3
+// LLVM:      [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[LANE]], float [[A]])
+// LLVM:      ret float [[FMA]]
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
 // ALL-LABEL: @test_vfmad_laneq_f64(
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
 // CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.double>

>From e2df24008b01866204aee437bdc222ef752c68a7 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Tue, 23 Jun 2026 11:44:18 +0300
Subject: [PATCH 4/4] [CIR][AArch64] Remove moved FMA tests

Remove legacy normal CodeGen checks for NEON FMA wrappers that are now
covered by the CIR-enabled fused-multiply tests.

Leave constrained-FP coverage and unrelated lane variants in place.
---
 clang/test/CodeGen/AArch64/neon-2velem.c      | 29 -------------
 clang/test/CodeGen/AArch64/neon-intrinsics.c  | 42 -------------------
 .../AArch64/neon-scalar-x-indexed-elem.c      | 34 ---------------
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 39 -----------------
 4 files changed, 144 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c 
b/clang/test/CodeGen/AArch64/neon-2velem.c
index c7eca2d8426c6..273d8ce3cb316 100644
--- a/clang/test/CodeGen/AArch64/neon-2velem.c
+++ b/clang/test/CodeGen/AArch64/neon-2velem.c
@@ -405,25 +405,6 @@ uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t 
v) {
   return vmulq_laneq_u32(a, v, 3);
 }
 
-// CHECK-LABEL: @test_vfma_lane_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x 
float> [[TMP6]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
-//
-float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  return vfma_lane_f32(a, b, v, 1);
-}
-
 // CHECK-LABEL: @test_vfms_lane_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -545,16 +526,6 @@ float64x2_t test_vfmsq_laneq_f64(float64x2_t a, 
float64x2_t b, float64x2_t v) {
   return vfmsq_laneq_f64(a, b, v, 1);
 }
 
-// CHECK-LABEL: @test_vfmas_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], 
float [[EXTRACT]], float [[A:%.*]])
-// CHECK-NEXT:    ret float [[TMP0]]
-//
-float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  return vfmas_laneq_f32(a, b, v, 3);
-}
-
 // CHECK-LABEL: @test_vfmsd_lane_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index b37ed5aa29f10..37c9bc9ea8dee 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -487,25 +487,6 @@ float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, 
float64x2_t v3) {
   return vmlsq_f64(v1, v2, v3);
 }
 
-// CHECK-LABEL: define dso_local <2 x float> @test_vfma_f32(
-// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], 
<2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]])
-// CHECK-NEXT:    ret <2 x float> [[TMP9]]
-//
-float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  return vfma_f32(v1, v2, v3);
-}
-
 // CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32(
 // CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], 
<2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -17393,28 +17374,6 @@ float64x1_t test_vmls_f64(float64x1_t a, float64x1_t 
b, float64x1_t c) {
   return vmls_f64(a, b, c);
 }
 
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
i64> undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
i64> undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[C]] to i64
-// CHECK-NEXT:    [[__P2_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x 
i64> undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> 
[[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> 
[[__P2_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x 
double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]])
-// CHECK-NEXT:    ret <1 x double> [[TMP9]]
-//
-float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  return vfma_f64(a, b, c);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -17788,4 +17747,3 @@ float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t 
b) {
 float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
   return vrsqrts_f64(a, b);
 }
-
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c 
b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index fdb772a79e973..59be5ac738278 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -148,17 +148,6 @@ float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, 
float64x2_t b) {
 }
 
 
-// CHECK-LABEL: define dso_local float @test_vfmas_lane_f32(
-// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> 
noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[EXTRACT]], float [[A]])
-// CHECK-NEXT:    ret float [[TMP0]]
-//
-float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
-  return vfmas_lane_f32(a, b, c, 1);
-}
-
 // CHECK-LABEL: define dso_local double @test_vfmad_lane_f64(
 // CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x 
double> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -182,29 +171,6 @@ float32_t test_vfmss_lane_f32(float32_t a, float32_t b, 
float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> 
[[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> 
[[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> 
[[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x 
double> [[TMP6]], <1 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x 
double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK-NEXT:    ret <1 x double> [[FMLA2]]
-//
-float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
-  return vfma_lane_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index 669c426b42687..a2d6c9d0bff89 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1550,25 +1550,6 @@ float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) 
{
   return vsubq_f16(a, b);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]])
-// CHECK-NEXT:    ret <4 x half> [[TMP9]]
-//
-float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
-  return vfma_f16(a, b, c);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfms_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
@@ -1609,26 +1590,6 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, 
float16x8_t c) {
   return vfmsq_f16(a, b, c);
 }
 
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_lane_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> 
[[TMP6]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x half> [[FMLA2]]
-//
-float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
-  return vfma_lane_f16(a, b, c, 3);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], 
half noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR][AArch64] Lower vfma_v, vfma_lane_v, and vfmas lane builtins (PR #204819)

Reply via email to