[clang] [CIR][AArch64] Lower vfma_v, vfma_lane_v, and vfmas lane builtins (PR #204819)

via cfe-commits Fri, 19 Jun 2026 05:58:16 -0700

llvmorg-github-actions[bot] wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clangir

Author: Yair Ben Avraham (yairbenavraham)

<details>
<summary>Changes</summary>

Lower additional AArch64 NEON fused multiply-accumulate builtins in CIR.

This covers:
  - BI__builtin_neon_vfma_v: vfma_f16, vfma_f32, vfma_f64
  - BI__builtin_neon_vfma_lane_v: vfma_lane_f16, vfma_lane_f32, vfma_lane_f64
  - BI__builtin_neon_vfmas_lane_f32: vfmas_lane_f32
  - BI__builtin_neon_vfmas_laneq_f32: vfmas_laneq_f32

vfma_v and vfma_lane_v reuse the corresponding quad FMA lowering paths. The 
lane forms splat the selected vector element, while the scalar forms extract 
the selected lane before emitting llvm.fma.

Add direct LLVM, CIR-to-LLVM, and CIR coverage for every listed ACLE wrapper in 
the existing fused-multiply tests.

Part of #<!-- -->185382
Follow-up to #<!-- -->202337

---
Full diff: https://github.com/llvm/llvm-project/pull/204819.diff


3 Files Affected:

- (modified) clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp (+36-13) 
- (modified) clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c (+46-22) 
- (modified) clang/test/CodeGen/AArch64/neon/fused-multiply.c (+70) 


``````````diff
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index b52e978fa0f4d..8be4d567cefc5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -837,15 +837,15 @@ static mlir::Value emitCommonNeonBuiltinExpr(
   case NEON::BI__builtin_neon_vcvtx_f32_v:
   case NEON::BI__builtin_neon_vext_v:
   case NEON::BI__builtin_neon_vextq_v:
-  case NEON::BI__builtin_neon_vfma_v:
     cgf.cgm.errorNYI(expr->getSourceRange(),
                      std::string("unimplemented AArch64 builtin call: ") +
                          ctx.BuiltinInfo.getName(builtinID));
     return mlir::Value{};
+  case NEON::BI__builtin_neon_vfma_v:
   case NEON::BI__builtin_neon_vfmaq_v: {
-    // NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2)
+    // NEON intrinsic: vfma(q)(accumulator, multiplicand1, multiplicand2)
     // LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator)
-    // Reorder arguments to match LLVM fma signature
+    // Reorder arguments to match LLVM fma signature.
     mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty);
     mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty);
     mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty);
@@ -2744,17 +2744,16 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return builder.createBitcast(ops[0], ty);
   }
   case NEON::BI__builtin_neon_vfma_lane_v:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AArch64 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return mlir::Value{};
   case NEON::BI__builtin_neon_vfmaq_lane_v: {
     mlir::Value addend = builder.createBitcast(ops[0], ty);
     mlir::Value multiplicand = builder.createBitcast(ops[1], ty);
-    // The lane source operand is the non-quad vector, so it has half as many
-    // lanes as the quad result vector.
-    cir::VectorType sourceTy =
-        cir::VectorType::get(ty.getElementType(), ty.getSize() / 2);
+    // For vfmaq_lane, the lane source operand is the non-quad vector, so it 
has
+    // half as many lanes as the quad result vector. For vfma_lane, it has the
+    // same shape as the result vector.
+    cir::VectorType sourceTy = cir::VectorType::get(
+        ty.getElementType(), builtinID == NEON::BI__builtin_neon_vfmaq_lane_v
+                                 ? ty.getSize() / 2
+                                 : ty.getSize());
     mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy);
     laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize());
 
@@ -2804,9 +2803,33 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
   }
   case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmas_lane_f32: {
+    // Scalar lane/laneq forms use one selected element from the lane source.
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(
+        builder, loc, "fma", convertType(expr->getType()), fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented AArch64 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return mlir::Value{};
+  case NEON::BI__builtin_neon_vfmas_laneq_f32: {
+    // Scalar lane/laneq forms use one selected element from the lane source.
+    mlir::Value laneSource = builder.createExtractElement(
+        loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3])));
+
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(
+        builder, loc, "fma", convertType(expr->getType()), fmaOps);
+  }
   case NEON::BI__builtin_neon_vfmad_lane_f64:
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
index 1460fb3b2bae1..cd929ac88a1a3 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c
@@ -27,6 +27,17 @@
 // 2.6.1.9.3 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
+// LLVM-LABEL: @test_vfma_f16(
+// CIR-LABEL: @vfma_f16(
+float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> {{.*}}, 
<4 x half> {{.*}}, <4 x half> {{.*}})
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_f16(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f16(
 // CIR-LABEL: @vfmaq_f16(
 float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
@@ -47,6 +58,19 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, 
float16x8_t c) {
   return vfmaq_f16(a, b, c);
 }
 
+// ALL-LABEL: @test_vfma_lane_f16(
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b,
+                                float16x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x 
!cir.f16>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, 
#cir.int<3> : !s32i] : !cir.vector<4 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x 
!cir.f16>) -> !cir.vector<4 x !cir.f16>
+
+// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <4 x half> {{.*}}, <4 x half> 
{{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// LLVM:      [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> {{.*}}, 
<4 x half> [[LANE]], <4 x half> {{.*}})
+// LLVM:      ret <4 x half> [[FMA]]
+  return vfma_lane_f16(a, b, c, 3);
+}
+
 // ALL-LABEL: @test_vfmaq_lane_f16(
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
                                  float16x4_t c) {
@@ -67,28 +91,6 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b,
   return vfmaq_lane_f16(a, b, c, 3);
 }
 
-// ALL-LABEL: @test_vfmaq_laneq_f16(
-float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
-                                  float16x8_t c) {
-// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
-// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
-
-// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
-// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
-// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
-// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
-// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
-// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
-// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
-// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
-// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
-// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
-// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
-// LLVM:      ret <8 x half> [[FMA]]
-  return vfmaq_laneq_f16(a, b, c, 7);
-}
-
 // ALL-LABEL: @test_vfma_laneq_f16(
 float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b,
                                  float16x8_t c) {
@@ -110,3 +112,25 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t 
b,
 // LLVM:      ret <4 x half> [[FMA]]
   return vfma_laneq_f16(a, b, c, 7);
 }
+
+// ALL-LABEL: @test_vfmaq_laneq_f16(
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b,
+                                  float16x8_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x 
!cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, 
#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : 
!s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16>
+// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : 
(!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x 
!cir.f16>) -> !cir.vector<8 x !cir.f16>
+
+// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x 
half> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8>
+// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8>
+// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8>
+// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half>
+// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half>
+// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half>
+// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> 
{{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> 
[[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]])
+// LLVM:      ret <8 x half> [[FMA]]
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c 
b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
index 4e30b063064aa..f90e9642c64fe 100644
--- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c
+++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c
@@ -29,6 +29,28 @@
 // 2.1.1.2.5 Fused multiply-accumulate, vector quad forms
 //===------------------------------------------------------===//
 
+// LLVM-LABEL: @test_vfma_f32(
+// CIR-LABEL: @vfma_f32(
+float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
{{.*}}, <2 x float> {{.*}}, <2 x float> {{.*}})
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_f32(a, b, c);
+}
+
+// LLVM-LABEL: @test_vfma_f64(
+// CIR-LABEL: @vfma_f64(
+float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
{{.*}}, <1 x double> {{.*}}, <1 x double> {{.*}})
+// LLVM:      ret <1 x double> [[FMA]]
+  return vfma_f64(a, b, c);
+}
+
 // LLVM-LABEL: @test_vfmaq_f32(
 // CIR-LABEL: @vfmaq_f32(
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
@@ -69,6 +91,30 @@ float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, 
float64x2_t c) {
   return vfmaq_f64(a, b, c);
 }
 
+// ALL-LABEL: @test_vfma_lane_f32(
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x 
!cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x 
!cir.float>) -> !cir.vector<2 x !cir.float>
+
+// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x 
float> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <2 x float> {{.*}}, <2 x float> 
{{.*}}, <2 x i32> <i32 1, i32 1>
+// LLVM:      [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> 
{{.*}}, <2 x float> [[LANE]], <2 x float> {{.*}})
+// LLVM:      ret <2 x float> [[FMA]]
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// ALL-LABEL: @test_vfma_lane_f64(
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<1 x 
!cir.double>) [#cir.int<0> : !s32i] : !cir.vector<1 x !cir.double>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x 
!cir.double>) -> !cir.vector<1 x !cir.double>
+
+// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 
x double> {{.*}} [[V:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = shufflevector <1 x double> {{.*}}, <1 x double> 
{{.*}}, <1 x i32> zeroinitializer
+// LLVM:      [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> 
{{.*}}, <1 x double> [[LANE]], <1 x double> {{.*}})
+// LLVM:      ret <1 x double> [[FMA]]
+  return vfma_lane_f64(a, b, v, 0);
+}
+
 // ALL-LABEL: @test_vfmaq_lane_f32(
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x 
!cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, 
#cir.int<1> : !s32i] : !cir.vector<4 x !cir.float>
@@ -264,6 +310,30 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, 
float64x2_t b,
   return vfmaq_laneq_f64(a, b, v, 0);
 }
 
+// ALL-LABEL: @test_vfmas_lane_f32(
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.float, !cir.float, !cir.float) -> !cir.float
+
+// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <2 x float> 
{{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <2 x float> [[C]], i{{32|64}} 1
+// LLVM:      [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[LANE]], float [[A]])
+// LLVM:      ret float [[FMA]]
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// ALL-LABEL: @test_vfmas_laneq_f32(
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<4 x !cir.float>
+// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : 
(!cir.float, !cir.float, !cir.float) -> !cir.float
+
+// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <4 x float> 
{{.*}} [[C:%.*]]) {{.*}} {
+// LLVM:      [[LANE:%.*]] = extractelement <4 x float> [[C]], i{{32|64}} 3
+// LLVM:      [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float 
[[LANE]], float [[A]])
+// LLVM:      ret float [[FMA]]
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
 // ALL-LABEL: @test_vfmad_laneq_f64(
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
 // CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : 
!cir.vector<2 x !cir.double>

``````````

</details>


https://github.com/llvm/llvm-project/pull/204819
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR][AArch64] Lower vfma_v, vfma_lane_v, and vfmas lane builtins (PR #204819)

Reply via email to