https://github.com/yairbenavraham updated 
https://github.com/llvm/llvm-project/pull/188190

>From 4ceade9630502af988e42a046e7568b3a71e96f5 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Wed, 25 Mar 2026 12:08:26 +0200
Subject: [PATCH 1/4] [CIR][AArch64] Lower vfma lane builtins

Lower the AArch64 vfma lane and laneq builtins in CIR codegen.

This adds handling for the vector and scalar vfma lane forms,
including the vfmaq_laneq_v family called out in the issue, and
keeps the CIR builtin structure aligned with the existing AArch64
builtin lowering pattern while preserving the original case order.

The scalar lane forms are dispatched before getNeonType() so the
f16 cases do not fall through the unsupported Poly128 path during
ClangIR lowering.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 66 ++++++++++++++-----
 1 file changed, 50 insertions(+), 16 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index a3488bfcc3dec..c972e9e12c430 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -139,11 +139,10 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, 
NeonTypeFlags typeFlags,
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
     [[fallthrough]];
   case NeonTypeFlags::Float16:
-    if (hasLegalHalfType)
+    if (!hasLegalHalfType)
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    else
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    [[fallthrough]];
+    return cir::VectorType::get(cgf->getCIRGenModule().fP16Ty,
+                                v1Ty ? 1 : (4 << isQuad));
   case NeonTypeFlags::Int32:
     return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
                                                        : cgf->sInt32Ty,
@@ -628,11 +627,6 @@ static bool hasExtraNeonArgument(unsigned builtinID) {
   case ARM::BI__builtin_arm_vcvtr_d:
     mask = 1;
   }
-  switch (builtinID) {
-  default:
-    break;
-  }
-
   return mask != 0;
 }
 
@@ -2186,6 +2180,23 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
+  switch (builtinID) {
+  case NEON::BI__builtin_neon_vfmah_lane_f16:
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmah_laneq_f16:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    mlir::Value lane = cir::VecExtractOp::create(builder, loc, ops[2], ops[3]);
+    mlir::Type scalarTy = convertType(expr->getType());
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], lane, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", scalarTy,
+                                           fmaOps);
+  }
+  default:
+    break;
+  }
+
   cir::VectorType ty = getNeonType(this, type, loc);
   if (!ty)
     return nullptr;
@@ -2200,13 +2211,36 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vfma_lane_v:
   case NEON::BI__builtin_neon_vfmaq_lane_v:
   case NEON::BI__builtin_neon_vfma_laneq_v:
-  case NEON::BI__builtin_neon_vfmaq_laneq_v:
-  case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
-  case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
-  case NEON::BI__builtin_neon_vfmad_lane_f64:
-  case NEON::BI__builtin_neon_vfmad_laneq_f64:
+  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
+    mlir::Value addend = ops[0];
+    mlir::Value multiplicand = ops[1];
+    mlir::Value laneSource = ops[2];
+    auto vecTy = mlir::cast<cir::VectorType>(ty);
+    auto elemTy = vecTy.getElementType();
+    auto numElts = vecTy.getSize();
+
+    if (addend.getType() != ty)
+      addend = builder.createBitcast(loc, addend, ty);
+    if (multiplicand.getType() != ty)
+      multiplicand = builder.createBitcast(loc, multiplicand, ty);
+
+    cir::VectorType sourceTy = ty;
+    if (builtinID == NEON::BI__builtin_neon_vfmaq_lane_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts / 2);
+    else if (builtinID == NEON::BI__builtin_neon_vfma_laneq_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts * 2);
+
+    if (laneSource.getType() != sourceTy)
+      laneSource = builder.createBitcast(loc, laneSource, sourceTy);
+
+    int64_t lane =
+        expr->getArg(3)->EvaluateKnownConstInt(getContext()).getSExtValue();
+    llvm::SmallVector<int64_t> mask(numElts, lane);
+    mlir::Value splat = builder.createVecShuffle(loc, laneSource, mask);
+
+    llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, splat, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
+  }
   case NEON::BI__builtin_neon_vmull_v:
   case NEON::BI__builtin_neon_vmax_v:
   case NEON::BI__builtin_neon_vmaxq_v:

>From ae6b618b696899275c900931323b2401499e8cb9 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Wed, 25 Mar 2026 12:08:27 +0200
Subject: [PATCH 2/4] [CIR][AArch64] Add vfma lane tests

Add focused AArch64 NEON tests for the vfma lane and laneq builtins.

The tests cover the vector and scalar forms used by this patch series
and are placed under clang/test/CodeGen/AArch64/neon for CIR-enabled
validation.

The corresponding legacy coverage is removed from the old AArch64 test
files so the new neon tests become the canonical checks for these cases.
---
 clang/test/CodeGen/AArch64/neon-2velem.c      | 225 ------------------
 .../AArch64/neon-scalar-x-indexed-elem.c      |  83 -------
 clang/test/CodeGen/AArch64/neon/vfma-lane.c   | 126 ++++++++++
 .../CodeGen/AArch64/neon/vfma-scalar-lane.c   |  67 ++++++
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   | 104 --------
 5 files changed, 193 insertions(+), 412 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfma-lane.c
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c

diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c 
b/clang/test/CodeGen/AArch64/neon-2velem.c
index 2bc7212cde9f8..de95a1983f574 100644
--- a/clang/test/CodeGen/AArch64/neon-2velem.c
+++ b/clang/test/CodeGen/AArch64/neon-2velem.c
@@ -404,83 +404,6 @@ uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) 
{
 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
   return vmulq_laneq_u32(a, v, 3);
 }
-
-// CHECK-LABEL: @test_vfma_lane_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x 
float> [[TMP6]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
-//
-float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  return vfma_lane_f32(a, b, v, 1);
-}
-
-// CHECK-LABEL: @test_vfmaq_lane_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x 
float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
-//
-float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  return vfmaq_lane_f32(a, b, v, 1);
-}
-
-// CHECK-LABEL: @test_vfma_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x 
float> [[TMP8]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]])
-// CHECK-NEXT:    ret <2 x float> [[TMP9]]
-//
-float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  return vfma_laneq_f32(a, b, v, 3);
-}
-
-// CHECK-LABEL: @test_vfmaq_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x 
float> [[TMP8]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
-// CHECK-NEXT:    ret <4 x float> [[TMP9]]
-//
-float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  return vfmaq_laneq_f32(a, b, v, 3);
-}
-
 // CHECK-LABEL: @test_vfms_lane_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -560,46 +483,6 @@ float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t 
b, float32x4_t v) {
 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vfmsq_laneq_f32(a, b, v, 3);
 }
-
-// CHECK-LABEL: @test_vfmaq_lane_f64(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> 
[[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x 
double> [[TMP6]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x 
double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x double> [[FMLA2]]
-//
-float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  return vfmaq_lane_f64(a, b, v, 0);
-}
-
-// CHECK-LABEL: @test_vfmaq_laneq_f64(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x 
double> [[TMP8]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x 
double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]])
-// CHECK-NEXT:    ret <2 x double> [[TMP9]]
-//
-float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  return vfmaq_laneq_f64(a, b, v, 1);
-}
-
 // CHECK-LABEL: @test_vfmsq_lane_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
@@ -640,17 +523,6 @@ float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t 
b, float64x1_t v) {
 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
   return vfmsq_laneq_f64(a, b, v, 1);
 }
-
-// CHECK-LABEL: @test_vfmas_laneq_f32(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], 
float [[EXTRACT]], float [[A:%.*]])
-// CHECK-NEXT:    ret float [[TMP0]]
-//
-float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  return vfmas_laneq_f32(a, b, v, 3);
-}
-
 // CHECK-LABEL: @test_vfmsd_lane_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
@@ -2547,83 +2419,6 @@ uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, 
uint32x4_t v) {
 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
   return vmulq_laneq_u32(a, v, 0);
 }
-
-// CHECK-LABEL: @test_vfma_lane_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x 
float> [[TMP6]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
-//
-float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  return vfma_lane_f32(a, b, v, 0);
-}
-
-// CHECK-LABEL: @test_vfmaq_lane_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x 
float> [[TMP6]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
-//
-float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) 
{
-  return vfmaq_lane_f32(a, b, v, 0);
-}
-
-// CHECK-LABEL: @test_vfma_laneq_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x 
float> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
[[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]])
-// CHECK-NEXT:    ret <2 x float> [[TMP9]]
-//
-float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) 
{
-  return vfma_laneq_f32(a, b, v, 0);
-}
-
-// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x 
float> [[TMP8]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> 
[[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
-// CHECK-NEXT:    ret <4 x float> [[TMP9]]
-//
-float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t 
v) {
-  return vfmaq_laneq_f32(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfms_lane_f32_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
@@ -2703,26 +2498,6 @@ float32x2_t test_vfms_laneq_f32_0(float32x2_t a, 
float32x2_t b, float32x4_t v) {
 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t 
v) {
   return vfmsq_laneq_f32(a, b, v, 0);
 }
-
-// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x 
double> [[TMP8]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x 
double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]])
-// CHECK-NEXT:    ret <2 x double> [[TMP9]]
-//
-float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t 
v) {
-  return vfmaq_laneq_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c 
b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 9b98126500444..d4f1abb0adb27 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -146,41 +146,6 @@ float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, 
float64x2_t b) {
 float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 1);
 }
-
-
-// CHECK-LABEL: define dso_local float @test_vfmas_lane_f32(
-// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> 
noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[EXTRACT]], float [[A]])
-// CHECK-NEXT:    ret float [[TMP0]]
-//
-float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
-  return vfmas_lane_f32(a, b, c, 1);
-}
-
-// CHECK-LABEL: define dso_local double @test_vfmad_lane_f64(
-// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x 
double> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i32 0
-// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], 
double [[EXTRACT]], double [[A]])
-// CHECK-NEXT:    ret double [[TMP0]]
-//
-float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
-  return vfmad_lane_f64(a, b, c, 0);
-}
-
-// CHECK-LABEL: define dso_local double @test_vfmad_laneq_f64(
-// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x 
double> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], 
double [[EXTRACT]], double [[A]])
-// CHECK-NEXT:    ret double [[TMP0]]
-//
-float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
-  return vfmad_laneq_f64(a, b, c, 1);
-}
-
 // CHECK-LABEL: define dso_local float @test_vfmss_lane_f32(
 // CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> 
noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -192,30 +157,6 @@ float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, 
float64x2_t c) {
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
 }
-
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
-// CHECK-NEXT:    [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP2]], i32 0
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> 
[[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> 
[[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> 
[[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x 
double> [[TMP6]], <1 x i32> zeroinitializer
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x 
double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK-NEXT:    ret <1 x double> [[FMLA2]]
-//
-float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
-  return vfma_lane_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -239,30 +180,6 @@ float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t 
b, float64x1_t v) {
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
 }
-
-// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
-// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
-// CHECK-NEXT:    [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
-// CHECK-NEXT:    [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> 
undef, i64 [[TMP1]], i32 0
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> 
[[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> 
[[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-// CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], 
double [[EXTRACT]], double [[TMP6]])
-// CHECK-NEXT:    [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP10]]
-//
-float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
-  return vfma_laneq_f64(a, b, v, 0);
-}
-
 // CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
 // CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-lane.c 
b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
new file mode 100644
index 0000000000000..ca0fe7805ec12
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
@@ -0,0 +1,126 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN:                   %clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none           -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none -fclangir -emit-cir  -o - %s |                    
           FileCheck %s --check-prefixes=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfma_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+  return vfma_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
+  return vfmaq_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
+  return vfma_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+  return vfmaq_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+  return vfma_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+  return vfmaq_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <1 x double> @llvm.fma.v1f64(
+// CIR-LABEL: @test_vfma_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+  return vfma_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+  return vfmaq_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f64(
+// LLVM: @llvm.fma
+// CIR-LABEL: @test_vfma_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
+  return vfma_laneq_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f64(
+// LLVM: shufflevector <2 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+  return vfmaq_laneq_f64(a, b, v, 1);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c 
b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
new file mode 100644
index 0000000000000..42e0c211d6dc6
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
@@ -0,0 +1,67 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN:                   %clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none           -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none -fclangir -emit-cir  -o - %s |                    
           FileCheck %s --check-prefixes=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfmah_lane_f16(
+// LLVM: extractelement <4 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_lane_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
+  return vfmah_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmah_laneq_f16(
+// LLVM: extractelement <8 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_laneq_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
+  return vfmah_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmas_lane_f32(
+// LLVM: extractelement <2 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_lane_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// LLVM-LABEL: @test_vfmas_laneq_f32(
+// LLVM: extractelement <4 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_laneq_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmad_lane_f64(
+// LLVM: extractelement <1 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_lane_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
+  return vfmad_lane_f64(a, b, c, 0);
+}
+
+// LLVM-LABEL: @test_vfmad_laneq_f64(
+// LLVM: extractelement <2 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_laneq_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
+  return vfmad_laneq_f64(a, b, c, 1);
+}
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index 9c408e8c702fd..b331ae7eb63db 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1679,87 +1679,6 @@ float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, 
float16x4_t c) {
 float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
   return vfmsq_f16(a, b, c);
 }
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_lane_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> 
[[TMP6]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
-// CHECK-NEXT:    ret <4 x half> [[FMLA2]]
-//
-float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
-  return vfma_lane_f16(a, b, c, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_lane_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> 
[[TMP6]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT:    [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
-// CHECK-NEXT:    ret <8 x half> [[FMLA2]]
-//
-float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
-  return vfmaq_lane_f16(a, b, c, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfma_laneq_f16
-// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> 
[[TMP8]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> 
[[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]])
-// CHECK-NEXT:    ret <4 x half> [[TMP9]]
-//
-float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
-  return vfma_laneq_f16(a, b, c, 7);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16
-// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
-// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
-// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> 
[[TMP8]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT:    [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]])
-// CHECK-NEXT:    ret <8 x half> [[TMP9]]
-//
-float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
-  return vfmaq_laneq_f16(a, b, c, 7);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], 
half noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
@@ -1809,29 +1728,6 @@ float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t 
b, float16_t c) {
 float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
   return vfmaq_n_f16(a, b, c);
 }
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmah_lane_f16
-// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> 
noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half 
[[EXTRACT]], half [[A]])
-// CHECK-NEXT:    ret half [[TMP0]]
-//
-float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
-  return vfmah_lane_f16(a, b, c, 3);
-}
-
-// CHECK-LABEL: define {{[^@]+}}@test_vfmah_laneq_f16
-// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> 
noundef [[C:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
-// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half 
[[EXTRACT]], half [[A]])
-// CHECK-NEXT:    ret half [[TMP0]]
-//
-float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
-  return vfmah_laneq_f16(a, b, c, 7);
-}
-
 // CHECK-LABEL: define {{[^@]+}}@test_vfms_lane_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:

>From 82eae963cd93063216a2242084d18c8be1fe6a82 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Sun, 5 Apr 2026 07:47:41 +0300
Subject: [PATCH 3/4] fixup! [CIR][AArch64] Add vfma lane tests

Update the ported vfma lane tests to restore the original LLVM
argument-tracking coverage and LLVM-SAME checks. Keep the CIR coverage
and a stable split between plain LLVM and ClangIR-to-LLVM prefixes
where the two pipelines emit different IR.
---
 clang/test/CodeGen/AArch64/neon/vfma-lane.c   | 212 +++++++++++++-----
 .../CodeGen/AArch64/neon/vfma-scalar-lane.c   |  78 ++++---
 2 files changed, 198 insertions(+), 92 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/neon/vfma-lane.c 
b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
index ca0fe7805ec12..25718245ce44a 100644
--- a/clang/test/CodeGen/AArch64/neon/vfma-lane.c
+++ b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
@@ -1,126 +1,214 @@
 // REQUIRES: aarch64-registered-target
 
-// RUN:                   %clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none           -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
-// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %}
+// RUN:                   %clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none           -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM,PLAINLLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S 
-passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM,CIRLLVM %}
 // RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu 
-target-feature +neon -target-feature +fullfp16 -disable-O0-optnone 
-flax-vector-conversions=none -fclangir -emit-cir  -o - %s |                    
           FileCheck %s --check-prefixes=CIR %}
 
 #include <arm_neon.h>
 
-// LLVM-LABEL: @test_vfma_lane_f16(
-// LLVM: shufflevector <4 x half>
-// LLVM: call <4 x half> @llvm.fma.v4f16(
+// LLVM-LABEL: define {{[^@]+}}@test_vfma_lane_f16
+// LLVM-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
 // CIR-LABEL: @test_vfma_lane_f16(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+  // LLVM: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+  // LLVM: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+  // LLVM: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+  // LLVM: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+  // LLVM: shufflevector <4 x half> [[TMP6]], <4 x half> {{[^,]+}}, <4 x i32> 
<i32 3, i32 3, i32 3, i32 3>
+  // LLVM: call <4 x half> @llvm.fma.v4f16(
   return vfma_lane_f16(a, b, c, 3);
 }
 
-// LLVM-LABEL: @test_vfmaq_lane_f16(
-// LLVM: shufflevector <4 x half>
-// LLVM: call <8 x half> @llvm.fma.v8f16(
+// LLVM-LABEL: define {{[^@]+}}@test_vfmaq_lane_f16
+// LLVM-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmaq_lane_f16(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+  // LLVM: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+  // LLVM: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+  // LLVM: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+  // LLVM: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+  // LLVM: shufflevector <4 x half> [[TMP6]], <4 x half> {{[^,]+}}, <8 x i32> 
<i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  // LLVM: call <8 x half> @llvm.fma.v8f16(
   return vfmaq_lane_f16(a, b, c, 3);
 }
 
-// LLVM-LABEL: @test_vfma_laneq_f16(
-// LLVM: shufflevector <8 x half>
-// LLVM: call <4 x half> @llvm.fma.v4f16(
+// LLVM-LABEL: define {{[^@]+}}@test_vfma_laneq_f16
+// LLVM-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfma_laneq_f16(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+  // LLVM: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+  // LLVM: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+  // LLVM: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+  // LLVM: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+  // LLVM: shufflevector <8 x half> [[TMP8]], <8 x half> {{[^,]+}}, <4 x i32> 
<i32 7, i32 7, i32 7, i32 7>
+  // LLVM: call <4 x half> @llvm.fma.v4f16(
   return vfma_laneq_f16(a, b, c, 7);
 }
 
-// LLVM-LABEL: @test_vfmaq_laneq_f16(
-// LLVM: shufflevector <8 x half>
-// LLVM: call <8 x half> @llvm.fma.v8f16(
+// LLVM-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16
+// LLVM-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 
x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmaq_laneq_f16(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+  // LLVM: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+  // LLVM: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+  // LLVM: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+  // LLVM: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+  // LLVM: shufflevector <8 x half> [[TMP8]], <8 x half> {{[^,]+}}, <8 x i32> 
<i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  // LLVM: call <8 x half> @llvm.fma.v8f16(
   return vfmaq_laneq_f16(a, b, c, 7);
 }
 
-// LLVM-LABEL: @test_vfma_lane_f32(
-// LLVM: shufflevector <2 x float>
-// LLVM: call <2 x float> @llvm.fma.v2f32(
+// LLVM-LABEL: define {{[^@]+}}@test_vfma_lane_f32(
+// LLVM-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 
x float> noundef [[V:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfma_lane_f32(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+  // LLVM: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+  // LLVM: [[TMP2:%.*]] = bitcast <2 x float> [[V]] to <2 x i32>
+  // LLVM: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+  // LLVM: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+  // LLVM: shufflevector <2 x float> [[TMP6]], <2 x float> {{[^,]+}}, <2 x 
i32> <i32 1, i32 1>
+  // LLVM: call <2 x float> @llvm.fma.v2f32(
   return vfma_lane_f32(a, b, v, 1);
 }
 
-// LLVM-LABEL: @test_vfmaq_lane_f32(
-// LLVM: shufflevector <2 x float>
-// LLVM: call <4 x float> @llvm.fma.v4f32(
+// LLVM-LABEL: define {{[^@]+}}@test_vfmaq_lane_f32(
+// LLVM-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 
x float> noundef [[V:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmaq_lane_f32(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+  // LLVM: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+  // LLVM: [[TMP2:%.*]] = bitcast <2 x float> [[V]] to <2 x i32>
+  // LLVM: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+  // LLVM: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+  // LLVM: shufflevector <2 x float> [[TMP6]], <2 x float> {{[^,]+}}, <4 x 
i32> <i32 1, i32 1, i32 1, i32 1>
+  // LLVM: call <4 x float> @llvm.fma.v4f32(
   return vfmaq_lane_f32(a, b, v, 1);
 }
 
-// LLVM-LABEL: @test_vfma_laneq_f32(
-// LLVM: shufflevector <4 x float>
-// LLVM: call <2 x float> @llvm.fma.v2f32(
+// LLVM-LABEL: define {{[^@]+}}@test_vfma_laneq_f32(
+// LLVM-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 
x float> noundef [[V:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfma_laneq_f32(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+  // LLVM: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+  // LLVM: [[TMP2:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
+  // LLVM: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+  // LLVM: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+  // LLVM: shufflevector <4 x float> [[TMP8]], <4 x float> {{[^,]+}}, <2 x 
i32> <i32 3, i32 3>
+  // LLVM: call <2 x float> @llvm.fma.v2f32(
   return vfma_laneq_f32(a, b, v, 3);
 }
 
-// LLVM-LABEL: @test_vfmaq_laneq_f32(
-// LLVM: shufflevector <4 x float>
-// LLVM: call <4 x float> @llvm.fma.v4f32(
+// LLVM-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f32(
+// LLVM-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 
x float> noundef [[V:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmaq_laneq_f32(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+  // LLVM: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+  // LLVM: [[TMP2:%.*]] = bitcast <4 x float> [[V]] to <4 x i32>
+  // LLVM: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+  // LLVM: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+  // LLVM: shufflevector <4 x float> [[TMP8]], <4 x float> {{[^,]+}}, <4 x 
i32> <i32 3, i32 3, i32 3, i32 3>
+  // LLVM: call <4 x float> @llvm.fma.v4f32(
   return vfmaq_laneq_f32(a, b, v, 3);
 }
 
-// LLVM-LABEL: @test_vfma_lane_f64(
-// LLVM: shufflevector <1 x double>
-// LLVM: call <1 x double> @llvm.fma.v1f64(
+// LLVM-LABEL: define {{[^@]+}}@test_vfma_lane_f64(
+// LLVM-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], 
<1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfma_lane_f64(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+  // LLVM: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+  // LLVM: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+  // LLVM: [[INS:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+  // LLVM: shufflevector <1 x double> {{[^,]+}}, <1 x double> {{[^,]+}}, <1 x 
i32> zeroinitializer
+  // LLVM: call <1 x double> @llvm.fma.v1f64(
   return vfma_lane_f64(a, b, v, 0);
 }
 
-// LLVM-LABEL: @test_vfmaq_lane_f64(
-// LLVM: shufflevector <1 x double>
-// LLVM: call <2 x double> @llvm.fma.v2f64(
+// LLVM-LABEL: define {{[^@]+}}@test_vfmaq_lane_f64(
+// LLVM-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]], 
<1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmaq_lane_f64(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+  // LLVM: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+  // LLVM: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+  // LLVM: [[INS:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+  // LLVM: shufflevector <1 x double> {{[^,]+}}, <1 x double> {{[^,]+}}, <2 x 
i32> zeroinitializer
+  // LLVM: call <2 x double> @llvm.fma.v2f64(
   return vfmaq_lane_f64(a, b, v, 0);
 }
 
-// LLVM-LABEL: @test_vfma_laneq_f64(
-// LLVM: @llvm.fma
+// PLAINLLVM-LABEL: define {{[^@]+}}@test_vfma_laneq_f64(
+// PLAINLLVM-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef 
[[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CIRLLVM-LABEL: define {{[^@]+}}@test_vfma_laneq_f64(
+// CIRLLVM-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef 
[[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfma_laneq_f64(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
-  return vfma_laneq_f64(a, b, v, 0);
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // PLAINLLVM: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+  // PLAINLLVM: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+  // PLAINLLVM: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+  // PLAINLLVM: extractelement <2 x double>{{.*}}, i32 1
+  // PLAINLLVM: call double @llvm.fma.f64(
+  // CIRLLVM: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+  // CIRLLVM: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+  // CIRLLVM: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+  // CIRLLVM: shufflevector <2 x double> {{[^,]+}}, <2 x double> {{[^,]+}}, <1 
x i32> <i32 1>
+  // CIRLLVM: call <1 x double> @llvm.fma.v1f64(
+  return vfma_laneq_f64(a, b, v, 1);
 }
 
-// LLVM-LABEL: @test_vfmaq_laneq_f64(
-// LLVM: shufflevector <2 x double>
-// LLVM: call <2 x double> @llvm.fma.v2f64(
+// LLVM-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f64(
+// LLVM-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]], 
<2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmaq_laneq_f64(
-// CIR: cir.vec.shuffle
-// CIR: cir.call_llvm_intrinsic "fma"
 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+  // CIR: cir.vec.shuffle
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+  // LLVM: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+  // LLVM: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+  // LLVM: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+  // LLVM: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+  // LLVM: shufflevector <2 x double> [[TMP8]], <2 x double> {{[^,]+}}, <2 x 
i32> <i32 1, i32 1>
+  // LLVM: call <2 x double> @llvm.fma.v2f64(
   return vfmaq_laneq_f64(a, b, v, 1);
 }
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c 
b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
index 42e0c211d6dc6..523fa7a832de7 100644
--- a/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
+++ b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
@@ -6,62 +6,80 @@
 
 #include <arm_neon.h>
 
-// LLVM-LABEL: @test_vfmah_lane_f16(
-// LLVM: extractelement <4 x half>
-// LLVM: call half @llvm.fma.f16(
+// LLVM-LABEL: define {{[^@]+}}@test_vfmah_lane_f16
+// LLVM-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> 
noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
 // CIR-LABEL: @test_vfmah_lane_f16(
-// CIR: cir.vec.extract
-// CIR: cir.call_llvm_intrinsic "fma"
 float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
+  // CIR: cir.vec.extract
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
+  // LLVM: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half 
[[EXTRACT]], half [[A]])
+  // LLVM: ret half [[TMP0]]
   return vfmah_lane_f16(a, b, c, 3);
 }
 
-// LLVM-LABEL: @test_vfmah_laneq_f16(
-// LLVM: extractelement <8 x half>
-// LLVM: call half @llvm.fma.f16(
+// LLVM-LABEL: define {{[^@]+}}@test_vfmah_laneq_f16
+// LLVM-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> 
noundef [[C:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmah_laneq_f16(
-// CIR: cir.vec.extract
-// CIR: cir.call_llvm_intrinsic "fma"
 float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
+  // CIR: cir.vec.extract
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
+  // LLVM: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half 
[[EXTRACT]], half [[A]])
+  // LLVM: ret half [[TMP0]]
   return vfmah_laneq_f16(a, b, c, 7);
 }
 
-// LLVM-LABEL: @test_vfmas_lane_f32(
-// LLVM: extractelement <2 x float>
-// LLVM: call float @llvm.fma.f32(
+// LLVM-LABEL: define dso_local float @test_vfmas_lane_f32(
+// LLVM-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> 
noundef [[C:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmas_lane_f32(
-// CIR: cir.vec.extract
-// CIR: cir.call_llvm_intrinsic "fma"
 float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+  // CIR: cir.vec.extract
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1
+  // LLVM: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[EXTRACT]], float [[A]])
+  // LLVM: ret float [[TMP0]]
   return vfmas_lane_f32(a, b, c, 1);
 }
 
-// LLVM-LABEL: @test_vfmas_laneq_f32(
-// LLVM: extractelement <4 x float>
-// LLVM: call float @llvm.fma.f32(
+// LLVM-LABEL: define dso_local float @test_vfmas_laneq_f32(
+// LLVM-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <4 x float> 
noundef [[C:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmas_laneq_f32(
-// CIR: cir.vec.extract
-// CIR: cir.call_llvm_intrinsic "fma"
 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+  // CIR: cir.vec.extract
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[EXTRACT:%.*]] = extractelement <4 x float> [[C]], i32 3
+  // LLVM: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[EXTRACT]], float [[A]])
+  // LLVM: ret float [[TMP0]]
   return vfmas_laneq_f32(a, b, c, 3);
 }
 
-// LLVM-LABEL: @test_vfmad_lane_f64(
-// LLVM: extractelement <1 x double>
-// LLVM: call double @llvm.fma.f64(
+// LLVM-LABEL: define dso_local double @test_vfmad_lane_f64(
+// LLVM-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> 
noundef [[C:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmad_lane_f64(
-// CIR: cir.vec.extract
-// CIR: cir.call_llvm_intrinsic "fma"
 float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
+  // CIR: cir.vec.extract
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i32 0
+  // LLVM: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double 
[[EXTRACT]], double [[A]])
+  // LLVM: ret double [[TMP0]]
   return vfmad_lane_f64(a, b, c, 0);
 }
 
-// LLVM-LABEL: @test_vfmad_laneq_f64(
-// LLVM: extractelement <2 x double>
-// LLVM: call double @llvm.fma.f64(
+// LLVM-LABEL: define dso_local double @test_vfmad_laneq_f64(
+// LLVM-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> 
noundef [[C:%.*]]) #[[ATTR0]] {
 // CIR-LABEL: @test_vfmad_laneq_f64(
-// CIR: cir.vec.extract
-// CIR: cir.call_llvm_intrinsic "fma"
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
+  // CIR: cir.vec.extract
+  // CIR: cir.call_llvm_intrinsic "fma"
+
+  // LLVM: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1
+  // LLVM: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double 
[[EXTRACT]], double [[A]])
+  // LLVM: ret double [[TMP0]]
   return vfmad_laneq_f64(a, b, c, 1);
 }

>From e2a0ed7c6c5a80e873e9f787655eecf800c11aaf Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Sun, 5 Apr 2026 07:47:42 +0300
Subject: [PATCH 4/4] fixup! [CIR][AArch64] Lower vfma lane builtins

Stop rejecting NeonTypeFlags::Float16 in getNeonType and keep the
required NEON vector types local to the vector-only lowering blocks.
This matches the current f16 support and avoids accidental
shared-setup dependencies between the scalar and vector paths.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 50 ++++++++++---------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index c972e9e12c430..cf6f2ba3d032b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -139,8 +139,6 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, 
NeonTypeFlags typeFlags,
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
     [[fallthrough]];
   case NeonTypeFlags::Float16:
-    if (!hasLegalHalfType)
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
     return cir::VectorType::get(cgf->getCIRGenModule().fP16Ty,
                                 v1Ty ? 1 : (4 << isQuad));
   case NeonTypeFlags::Int32:
@@ -627,6 +625,11 @@ static bool hasExtraNeonArgument(unsigned builtinID) {
   case ARM::BI__builtin_arm_vcvtr_d:
     mask = 1;
   }
+  switch (builtinID) {
+  default:
+    break;
+  }
+
   return mask != 0;
 }
 
@@ -2180,27 +2183,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
-  switch (builtinID) {
-  case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
-  case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
-  case NEON::BI__builtin_neon_vfmad_lane_f64:
-  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
-    mlir::Value lane = cir::VecExtractOp::create(builder, loc, ops[2], ops[3]);
-    mlir::Type scalarTy = convertType(expr->getType());
-    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], lane, ops[0]};
-    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", scalarTy,
-                                           fmaOps);
-  }
-  default:
-    break;
-  }
-
-  cir::VectorType ty = getNeonType(this, type, loc);
-  if (!ty)
-    return nullptr;
-
   llvm::StringRef intrName;
 
   switch (builtinID) {
@@ -2212,6 +2194,10 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
   case NEON::BI__builtin_neon_vfmaq_lane_v:
   case NEON::BI__builtin_neon_vfma_laneq_v:
   case NEON::BI__builtin_neon_vfmaq_laneq_v: {
+    // Keep the NEON vector type local to each vector-only builtin block.
+    cir::VectorType ty = getNeonType(this, type, loc);
+    if (!ty)
+      return nullptr;
     mlir::Value addend = ops[0];
     mlir::Value multiplicand = ops[1];
     mlir::Value laneSource = ops[2];
@@ -2241,6 +2227,18 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, splat, addend};
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
+  case NEON::BI__builtin_neon_vfmah_lane_f16:
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmah_laneq_f16:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    mlir::Value lane = cir::VecExtractOp::create(builder, loc, ops[2], ops[3]);
+    mlir::Type scalarTy = convertType(expr->getType());
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], lane, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", scalarTy,
+                                           fmaOps);
+  }
   case NEON::BI__builtin_neon_vmull_v:
   case NEON::BI__builtin_neon_vmax_v:
   case NEON::BI__builtin_neon_vmaxq_v:
@@ -2253,11 +2251,15 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
                      getContext().BuiltinInfo.getName(builtinID));
     return mlir::Value{};
   case NEON::BI__builtin_neon_vabd_v:
-  case NEON::BI__builtin_neon_vabdq_v:
+  case NEON::BI__builtin_neon_vabdq_v: {
+    cir::VectorType ty = getNeonType(this, type, loc);
+    if (!ty)
+      return nullptr;
     intrName = usgn ? "aarch64.neon.uabd" : "aarch64.neon.sabd";
     if (cir::isFPOrVectorOfFPType(ty))
       intrName = "aarch64.neon.fabd";
     return emitNeonCall(cgm, builder, {ty, ty}, ops, intrName, ty, loc);
+  }
   case NEON::BI__builtin_neon_vpadal_v:
   case NEON::BI__builtin_neon_vpadalq_v:
   case NEON::BI__builtin_neon_vpmin_v:

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to