https://github.com/yairbenavraham created 
https://github.com/llvm/llvm-project/pull/188190

This PR implements the AArch64 NEON ClangIR lowering for the vfma lane/laneq 
builtins and adds CIR-enabled regression tests.

Covered scope:
  - vector lane/laneq forms
  - scalar lane/laneq forms
  - includes the vfmaq_laneq_v family called out in #185382

Validation:
  - clean build from scratch
  - post-build sanity check
  - focused llvm-lit validation for the touched AArch64 NEON tests

Part of #185382 

>From 3aa2a1dcd459df3235b33430e02d99d9f76fe00d Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Sun, 22 Mar 2026 04:57:07 +0200
Subject: [PATCH 1/3] [CIR] Fix generated type constraint header dependencies

Add the missing MLIRCIRTypeConstraintsIncGen dependencies in the
CIR dialect and lowering CMake targets so clean CIR-enabled builds
generate the required headers before the lowering libraries are
compiled.
---
 clang/include/clang/CIR/Dialect/IR/CMakeLists.txt  | 2 +-
 clang/lib/CIR/Lowering/CMakeLists.txt              | 3 +++
 clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
index 870f9e3f5d052..1388e5bc612f2 100644
--- a/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
+++ b/clang/include/clang/CIR/Dialect/IR/CMakeLists.txt
@@ -27,5 +27,5 @@ clang_tablegen(CIRLowering.inc -gen-cir-lowering
 set(LLVM_TARGET_DEFINITIONS CIRTypeConstraints.td)
 mlir_tablegen(CIRTypeConstraints.h.inc -gen-type-constraint-decls)
 mlir_tablegen(CIRTypeConstraints.cpp.inc -gen-type-constraint-defs)
-add_public_tablegen_target(MLIRCIRTypeConstraintsIncGen)
+add_mlir_generic_tablegen_target(MLIRCIRTypeConstraintsIncGen)
 add_dependencies(mlir-headers MLIRCIRTypeConstraintsIncGen)
diff --git a/clang/lib/CIR/Lowering/CMakeLists.txt b/clang/lib/CIR/Lowering/CMakeLists.txt
index 28ec3c551018c..77d28ef72d11d 100644
--- a/clang/lib/CIR/Lowering/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/CMakeLists.txt
@@ -9,6 +9,9 @@ add_clang_library(clangCIRLoweringCommon
   CIRPasses.cpp
   LoweringHelpers.cpp
 
+  DEPENDS
+  MLIRCIRTypeConstraintsIncGen
+
   LINK_LIBS
   clangCIR
   ${dialect_libs}
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
index c7467fe40ba30..5b197ddca12c0 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
@@ -13,6 +13,7 @@ add_clang_library(clangCIRLoweringDirectToLLVM
   MLIRCIREnumsGen
   MLIRCIROpsIncGen
   MLIRCIROpInterfacesIncGen
+  MLIRCIRTypeConstraintsIncGen
 
   LINK_LIBS
   clangCIRLoweringCommon

>From edf952a72486ae41b7720f92067638ee76b09251 Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Mon, 23 Mar 2026 17:55:19 +0200
Subject: [PATCH 2/3] [CIR][AArch64] Lower vfma lane builtins

Lower the AArch64 vfma lane and laneq builtins in CIR codegen.

This adds handling for the vector and scalar vfma lane forms,
including the vfmaq_laneq_v family called out in the issue, and
keeps the CIR builtin structure aligned with the existing AArch64
builtin lowering pattern.

The patch also includes the required formatting adjustment so the
implementation matches the repository clang-format style.
---
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  | 67 +++++++++++++++----
 1 file changed, 53 insertions(+), 14 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 5d7b8d839fa84..26560b2ab3447 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -801,11 +801,10 @@ static cir::VectorType getNeonType(CIRGenFunction *cgf, NeonTypeFlags typeFlags,
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: BFloat16"));
     [[fallthrough]];
   case NeonTypeFlags::Float16:
-    if (hasLegalHalfType)
+    if (!hasLegalHalfType)
       cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    else
-      cgf->getCIRGenModule().errorNYI(loc, std::string("NEON type: Float16"));
-    [[fallthrough]];
+    return cir::VectorType::get(cgf->getCIRGenModule().fP16Ty,
+                                v1Ty ? 1 : (4 << isQuad));
   case NeonTypeFlags::Int32:
     return cir::VectorType::get(typeFlags.isUnsigned() ? cgf->uInt32Ty
                                                        : cgf->sInt32Ty,
@@ -2848,6 +2847,23 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
+  switch (builtinID) {
+  case NEON::BI__builtin_neon_vfmah_lane_f16:
+  case NEON::BI__builtin_neon_vfmas_lane_f32:
+  case NEON::BI__builtin_neon_vfmah_laneq_f16:
+  case NEON::BI__builtin_neon_vfmas_laneq_f32:
+  case NEON::BI__builtin_neon_vfmad_lane_f64:
+  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
+    mlir::Value lane = cir::VecExtractOp::create(builder, loc, ops[2], ops[3]);
+    mlir::Type scalarTy = convertType(expr->getType());
+    llvm::SmallVector<mlir::Value> fmaOps = {ops[1], lane, ops[0]};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", scalarTy,
+                                           fmaOps);
+  }
+  default:
+    break;
+  }
+
   cir::VectorType ty = getNeonType(this, type, loc);
   if (!ty)
     return nullptr;
@@ -2859,16 +2875,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     return std::nullopt;
   case NEON::BI__builtin_neon_vbsl_v:
   case NEON::BI__builtin_neon_vbslq_v:
-  case NEON::BI__builtin_neon_vfma_lane_v:
-  case NEON::BI__builtin_neon_vfmaq_lane_v:
-  case NEON::BI__builtin_neon_vfma_laneq_v:
-  case NEON::BI__builtin_neon_vfmaq_laneq_v:
-  case NEON::BI__builtin_neon_vfmah_lane_f16:
-  case NEON::BI__builtin_neon_vfmas_lane_f32:
-  case NEON::BI__builtin_neon_vfmah_laneq_f16:
-  case NEON::BI__builtin_neon_vfmas_laneq_f32:
-  case NEON::BI__builtin_neon_vfmad_lane_f64:
-  case NEON::BI__builtin_neon_vfmad_laneq_f64:
   case NEON::BI__builtin_neon_vmull_v:
   case NEON::BI__builtin_neon_vmax_v:
   case NEON::BI__builtin_neon_vmaxq_v:
@@ -2886,6 +2892,39 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
     if (cir::isFPOrVectorOfFPType(ty))
       intrName = "aarch64.neon.fabd";
     return emitNeonCall(cgm, builder, {ty, ty}, ops, intrName, ty, loc);
+  case NEON::BI__builtin_neon_vfma_lane_v:
+  case NEON::BI__builtin_neon_vfmaq_lane_v:
+  case NEON::BI__builtin_neon_vfma_laneq_v:
+  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
+    mlir::Value addend = ops[0];
+    mlir::Value multiplicand = ops[1];
+    mlir::Value laneSource = ops[2];
+    auto vecTy = mlir::cast<cir::VectorType>(ty);
+    auto elemTy = vecTy.getElementType();
+    auto numElts = vecTy.getSize();
+
+    if (addend.getType() != ty)
+      addend = builder.createBitcast(loc, addend, ty);
+    if (multiplicand.getType() != ty)
+      multiplicand = builder.createBitcast(loc, multiplicand, ty);
+
+    cir::VectorType sourceTy = ty;
+    if (builtinID == NEON::BI__builtin_neon_vfmaq_lane_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts / 2);
+    else if (builtinID == NEON::BI__builtin_neon_vfma_laneq_v)
+      sourceTy = cir::VectorType::get(elemTy, numElts * 2);
+
+    if (laneSource.getType() != sourceTy)
+      laneSource = builder.createBitcast(loc, laneSource, sourceTy);
+
+    int64_t lane =
+        expr->getArg(3)->EvaluateKnownConstInt(getContext()).getSExtValue();
+    llvm::SmallVector<int64_t> mask(numElts, lane);
+    mlir::Value splat = builder.createVecShuffle(loc, laneSource, mask);
+
+    llvm::SmallVector<mlir::Value> fmaOps = {multiplicand, splat, addend};
+    return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps);
+  }
   case NEON::BI__builtin_neon_vpadal_v:
   case NEON::BI__builtin_neon_vpadalq_v:
   case NEON::BI__builtin_neon_vpmin_v:

>From a7e2f678ea07679d73974bf3aa200f3248a7c1db Mon Sep 17 00:00:00 2001
From: Yair Ben Avraham <[email protected]>
Date: Mon, 23 Mar 2026 17:55:19 +0200
Subject: [PATCH 3/3] [CIR][AArch64] Add vfma lane tests

Add focused AArch64 NEON tests for the vfma lane and laneq builtins.

The tests cover the vector and scalar forms used by this patch series
and are placed under clang/test/CodeGen/AArch64/neon for CIR-enabled
validation.
---
 clang/test/CodeGen/AArch64/neon/vfma-lane.c   | 136 ++++++++++++++++++
 .../CodeGen/AArch64/neon/vfma-scalar-lane.c   |  77 ++++++++++
 2 files changed, 213 insertions(+)
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfma-lane.c
 create mode 100644 clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c

diff --git a/clang/test/CodeGen/AArch64/neon/vfma-lane.c b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
new file mode 100644
index 0000000000000..955ab411793b9
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-lane.c
@@ -0,0 +1,136 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN:   -target-feature +fullfp16 -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-cir -o - %s | FileCheck %s --check-prefix=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfma_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
+  return vfma_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f16(
+// LLVM: shufflevector <4 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_lane_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
+  return vfmaq_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <4 x half> @llvm.fma.v4f16(
+// CIR-LABEL: @test_vfma_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
+  return vfma_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f16(
+// LLVM: shufflevector <8 x half>
+// LLVM: call <8 x half> @llvm.fma.v8f16(
+// CIR-LABEL: @test_vfmaq_laneq_f16(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
+  return vfmaq_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+  return vfma_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f32(
+// LLVM: shufflevector <2 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_lane_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+  return vfmaq_lane_f32(a, b, v, 1);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <2 x float> @llvm.fma.v2f32(
+// CIR-LABEL: @test_vfma_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+  return vfma_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f32(
+// LLVM: shufflevector <4 x float>
+// LLVM: call <4 x float> @llvm.fma.v4f32(
+// CIR-LABEL: @test_vfmaq_laneq_f32(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+  return vfmaq_laneq_f32(a, b, v, 3);
+}
+
+// LLVM-LABEL: @test_vfma_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <1 x double> @llvm.fma.v1f64(
+// CIR-LABEL: @test_vfma_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
+  return vfma_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_lane_f64(
+// LLVM: shufflevector <1 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_lane_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+  return vfmaq_lane_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfma_laneq_f64(
+// LLVM: @llvm.fma
+// CIR-LABEL: @test_vfma_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
+  return vfma_laneq_f64(a, b, v, 0);
+}
+
+// LLVM-LABEL: @test_vfmaq_laneq_f64(
+// LLVM: shufflevector <2 x double>
+// LLVM: call <2 x double> @llvm.fma.v2f64(
+// CIR-LABEL: @test_vfmaq_laneq_f64(
+// CIR: cir.vec.shuffle
+// CIR: cir.call_llvm_intrinsic "fma"
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+  return vfmaq_laneq_f64(a, b, v, 1);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
new file mode 100644
index 0000000000000..53fc9761e01a0
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/vfma-scalar-lane.c
@@ -0,0 +1,77 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
+// RUN:   -target-feature +fullfp16 -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-llvm -o - %s | \
+// RUN:   opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefix=LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu \
+// RUN:   -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -disable-O0-optnone -flax-vector-conversions=none \
+// RUN:   -fclangir -emit-cir -o - %s | FileCheck %s --check-prefix=CIR %}
+
+#include <arm_neon.h>
+
+// LLVM-LABEL: @test_vfmah_lane_f16(
+// LLVM: extractelement <4 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_lane_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
+  return vfmah_lane_f16(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmah_laneq_f16(
+// LLVM: extractelement <8 x half>
+// LLVM: call half @llvm.fma.f16(
+// CIR-LABEL: @test_vfmah_laneq_f16(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
+  return vfmah_laneq_f16(a, b, c, 7);
+}
+
+// LLVM-LABEL: @test_vfmas_lane_f32(
+// LLVM: extractelement <2 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_lane_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
+  return vfmas_lane_f32(a, b, c, 1);
+}
+
+// LLVM-LABEL: @test_vfmas_laneq_f32(
+// LLVM: extractelement <4 x float>
+// LLVM: call float @llvm.fma.f32(
+// CIR-LABEL: @test_vfmas_laneq_f32(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) {
+  return vfmas_laneq_f32(a, b, c, 3);
+}
+
+// LLVM-LABEL: @test_vfmad_lane_f64(
+// LLVM: extractelement <1 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_lane_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
+  return vfmad_lane_f64(a, b, c, 0);
+}
+
+// LLVM-LABEL: @test_vfmad_laneq_f64(
+// LLVM: extractelement <2 x double>
+// LLVM: call double @llvm.fma.f64(
+// CIR-LABEL: @test_vfmad_laneq_f64(
+// CIR: cir.vec.extract
+// CIR: cir.call_llvm_intrinsic "fma"
+float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
+  return vfmad_laneq_f64(a, b, c, 1);
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to