https://github.com/yairbenavraham updated https://github.com/llvm/llvm-project/pull/195602
>From 5047649698cd34eaa6e824405782991d66bf2234 Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham <[email protected]> Date: Wed, 29 Apr 2026 19:24:37 +0300 Subject: [PATCH 1/4] [CIR][AArch64] Lower vfmaq_v f32/f64 Lower BI__builtin_neon_vfmaq_v for the vfmaq_f32 and vfmaq_f64 wrappers through the LLVM fma intrinsic. Keep vfma_v and vfmaq_f16 outside this focused split. Move the replaced vfmaq_f32 and vfmaq_f64 tests into neon/vfmaq.c with CIR coverage. --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 23 ++++++- clang/test/CodeGen/AArch64/neon-intrinsics.c | 38 ----------- clang/test/CodeGen/AArch64/neon/vfmaq.c | 65 +++++++++++++++++++ 3 files changed, 87 insertions(+), 39 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/neon/vfmaq.c diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 834f66586833b..349d6c837af12 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -501,6 +501,7 @@ static mlir::Value emitCommonNeonBuiltinExpr( // Determine the type of this overloaded NEON intrinsic. NeonTypeFlags neonType(neonTypeConst->getZExtValue()); + const bool isUnsigned = neonType.isUnsigned(); const bool hasLegalHalfType = cgf.getTarget().hasFastHalfType(); const bool usgn = neonType.isUnsigned(); @@ -677,7 +678,20 @@ static mlir::Value emitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vext_v: case NEON::BI__builtin_neon_vextq_v: case NEON::BI__builtin_neon_vfma_v: - case NEON::BI__builtin_neon_vfmaq_v: + cgf.cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + ctx.BuiltinInfo.getName(builtinID)); + return mlir::Value{}; + case NEON::BI__builtin_neon_vfmaq_v: { + mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty); + mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty); + mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty); + llvm::SmallVector<mlir::Value> fmaOps = {op1, op2, op0}; + return cir::LLVMIntrinsicCallOp::create( + cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"), + ty, fmaOps) + .getResult(); + } case NEON::BI__builtin_neon_vld1_v: case NEON::BI__builtin_neon_vld1q_v: case NEON::BI__builtin_neon_vld1_x2_v: @@ -2092,6 +2106,13 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, return mlir::Value{}; } + if (builtinID == NEON::BI__builtin_neon_vfmaq_f16) { + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return mlir::Value{}; + } + // Handle MSVC intrinsics before argument evaluation to prevent double // evaluation. assert(!cir::MissingFeatures::msvcBuiltins()); diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c index 784d9624823d5..64bbf3e90d675 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c @@ -890,44 +890,6 @@ float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vfma_f32(v1, v2, v3); } -// CHECK-LABEL: define dso_local <4 x float> @test_vfmaq_f32( -// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]]) -// CHECK-NEXT: ret <4 x float> [[TMP9]] -// -float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { - return vfmaq_f32(v1, v2, v3); -} - -// CHECK-LABEL: define dso_local <2 x double> @test_vfmaq_f64( -// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> -// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]]) -// CHECK-NEXT: ret <2 x double> [[TMP9]] -// -float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { - return vfmaq_f64(v1, v2, v3); -} - // CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32( // CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon/vfmaq.c b/clang/test/CodeGen/AArch64/neon/vfmaq.c new file mode 100644 index 0000000000000..54bc9d1a2cc5c --- /dev/null +++ b/clang/test/CodeGen/AArch64/neon/vfmaq.c @@ -0,0 +1,65 @@ +// REQUIRES: aarch64-registered-target || arm-registered-target + +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM +// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %} +// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=CIR %} + +//============================================================================= +// NOTES +// +// This file contains tests that were originally located in: +// * clang/test/CodeGen/AArch64/neon-intrinsics.c +// The main difference is the use of RUN lines that enable ClangIR lowering. +// This file currently covers the f32/f64 wrappers that lower through +// BI__builtin_neon_vfmaq_v. +// +// ACLE section headings based on v2025Q2 of the ACLE specification: +// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#fused-multiply-accumulate +// +//============================================================================= + +#include <arm_neon.h> + +//===------------------------------------------------------===// +// Fused multiply-accumulate, vector quad forms +//===------------------------------------------------------===// + +// CIR-LABEL: @vfmaq_f32( +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float> + +// LLVM-LABEL: @test_vfmaq_f32( +float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { +// LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// LLVM-NEXT: [[C_I:%.*]] = bitcast <4 x float> [[C]] to <4 x i32> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <4 x i32> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <4 x i32> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <4 x i32> [[C_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <4 x float> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <4 x float> +// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <4 x float> +// LLVM-NEXT: [[FMA:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B_CAST]], <4 x float> [[C_CAST]], <4 x float> [[A_CAST]]) +// LLVM-NEXT: ret <4 x float> [[FMA]] + return vfmaq_f32(a, b, c); +} + +// CIR-LABEL: @vfmaq_f64( +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double> + +// LLVM-LABEL: @test_vfmaq_f64( +float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) { +// LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// LLVM-NEXT: [[C_I:%.*]] = bitcast <2 x double> [[C]] to <2 x i64> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <2 x i64> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <2 x i64> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <2 x i64> [[C_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <2 x double> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <2 x double> +// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <2 x double> +// LLVM-NEXT: [[FMA:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B_CAST]], <2 x double> [[C_CAST]], <2 x double> [[A_CAST]]) +// LLVM-NEXT: ret <2 x double> [[FMA]] + return vfmaq_f64(a, b, c); +} >From bbf5ccdb53b77b58cbe8b68bf6a66915412f484b Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham <[email protected]> Date: Mon, 4 May 2026 15:05:24 +0300 Subject: [PATCH 2/4] [CIR][AArch64] Format vfmaq_v lowering --- clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 349d6c837af12..6fc63c46e949d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -688,8 +688,8 @@ static mlir::Value emitCommonNeonBuiltinExpr( mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty); llvm::SmallVector<mlir::Value> fmaOps = {op1, op2, op0}; return cir::LLVMIntrinsicCallOp::create( - cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"), - ty, fmaOps) + cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"), ty, + fmaOps) .getResult(); } case NEON::BI__builtin_neon_vld1_v: >From 7cb2f9673150a2c410cda037bfbe307be44417c8 Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham <[email protected]> Date: Mon, 4 May 2026 22:40:26 +0300 Subject: [PATCH 3/4] [CIR][AArch64] Refine vfmaq_v lowering Move the helper definition above the first use. --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 32 +++++++++---------- clang/test/CodeGen/AArch64/neon/vfmaq.c | 11 ++++--- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 6fc63c46e949d..50b9f8020235e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -85,6 +85,17 @@ findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> intrinsicMap, //===----------------------------------------------------------------------===// // Generic helpers //===----------------------------------------------------------------------===// +// Emit an intrinsic where all operands are of the same type as the result. +// Depending on mode, this may be a constrained floating-point intrinsic. +static mlir::Value +emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc, + StringRef intrName, mlir::Type retTy, + llvm::SmallVector<mlir::Value> &ops) { + assert(!cir::MissingFeatures::emitConstrainedFPCall()); + + return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops); +} + static llvm::StringRef getLLVMIntrNameNoPrefix(llvm::Intrinsic::ID intrID) { llvm::StringRef llvmIntrName = llvm::Intrinsic::getBaseName(intrID); assert(llvmIntrName.starts_with("llvm.") && "Not an LLVM intrinsic!"); @@ -501,7 +512,6 @@ static mlir::Value emitCommonNeonBuiltinExpr( // Determine the type of this overloaded NEON intrinsic. NeonTypeFlags neonType(neonTypeConst->getZExtValue()); - const bool isUnsigned = neonType.isUnsigned(); const bool hasLegalHalfType = cgf.getTarget().hasFastHalfType(); const bool usgn = neonType.isUnsigned(); @@ -683,14 +693,15 @@ static mlir::Value emitCommonNeonBuiltinExpr( ctx.BuiltinInfo.getName(builtinID)); return mlir::Value{}; case NEON::BI__builtin_neon_vfmaq_v: { + // NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2) + // LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator) + // Reorder arguments to match LLVM fma signature mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty); mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty); mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty); llvm::SmallVector<mlir::Value> fmaOps = {op1, op2, op0}; - return cir::LLVMIntrinsicCallOp::create( - cgf.getBuilder(), loc, cgf.getBuilder().getStringAttr("fma"), ty, - fmaOps) - .getResult(); + return emitCallMaybeConstrainedBuiltin(cgf.getBuilder(), loc, "fma", ty, + fmaOps); } case NEON::BI__builtin_neon_vld1_v: case NEON::BI__builtin_neon_vld1q_v: @@ -876,17 +887,6 @@ static mlir::Value emitCommonNeonBuiltinExpr( } } -// Emit an intrinsic where all operands are of the same type as the result. -// Depending on mode, this may be a constrained floating-point intrinsic. -static mlir::Value -emitCallMaybeConstrainedBuiltin(CIRGenBuilderTy &builder, mlir::Location loc, - StringRef intrName, mlir::Type retTy, - llvm::SmallVector<mlir::Value> &ops) { - assert(!cir::MissingFeatures::emitConstrainedFPCall()); - - return builder.emitIntrinsicCallOp(loc, intrName, retTy, ops); -} - bool CIRGenFunction::getAArch64SVEProcessedOperands( unsigned builtinID, const CallExpr *expr, SmallVectorImpl<mlir::Value> &ops, SVETypeFlags typeFlags) { diff --git a/clang/test/CodeGen/AArch64/neon/vfmaq.c b/clang/test/CodeGen/AArch64/neon/vfmaq.c index 54bc9d1a2cc5c..1c05de703dace 100644 --- a/clang/test/CodeGen/AArch64/neon/vfmaq.c +++ b/clang/test/CodeGen/AArch64/neon/vfmaq.c @@ -21,14 +21,15 @@ #include <arm_neon.h> //===------------------------------------------------------===// -// Fused multiply-accumulate, vector quad forms +// 2.6.1.9.3 Fused multiply-accumulate, vector quad forms //===------------------------------------------------------===// -// CIR-LABEL: @vfmaq_f32( -// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float> // LLVM-LABEL: @test_vfmaq_f32( +// CIR-LABEL: @vfmaq_f32( float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>, !cir.vector<4 x !cir.float>) -> !cir.vector<4 x !cir.float> + // LLVM-SAME: <4 x float> {{.*}} [[A:%.*]], <4 x float> {{.*}} [[B:%.*]], <4 x float> {{.*}} [[C:%.*]]) {{.*}} { // LLVM: [[A_I:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> // LLVM-NEXT: [[B_I:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> @@ -44,11 +45,11 @@ float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(a, b, c); } +// LLVM-LABEL: @test_vfmaq_f64( // CIR-LABEL: @vfmaq_f64( +float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) { // CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>, !cir.vector<2 x !cir.double>) -> !cir.vector<2 x !cir.double> -// LLVM-LABEL: @test_vfmaq_f64( -float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) { // LLVM-SAME: <2 x double> {{.*}} [[A:%.*]], <2 x double> {{.*}} [[B:%.*]], <2 x double> {{.*}} [[C:%.*]]) {{.*}} { // LLVM: [[A_I:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> // LLVM-NEXT: [[B_I:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> >From 578b3160b1dbadfc4bd19f47490bf3df6e6c5567 Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham <[email protected]> Date: Thu, 7 May 2026 18:23:26 +0300 Subject: [PATCH 4/4] [clang][test] Add %clang_cc1_cg_arm64_neon substitution Move the vfmaq coverage to fused-multiply.c and use the shared %clang_cc1_cg_arm64_neon test substitution for LLVM and CIR checks. Add vfmaq_f16 under +fullfp16, drop its temporary CIR NYI guard, and remove the duplicated non-constrained legacy check. The constrained-FP coverage remains. --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 7 ----- .../neon/{vfmaq.c => fused-multiply.c} | 30 ++++++++++++++++--- .../CodeGen/AArch64/v8.2a-neon-intrinsics.c | 19 ------------ 3 files changed, 26 insertions(+), 30 deletions(-) rename clang/test/CodeGen/AArch64/neon/{vfmaq.c => fused-multiply.c} (63%) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 50b9f8020235e..f253852673059 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -2106,13 +2106,6 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, return mlir::Value{}; } - if (builtinID == NEON::BI__builtin_neon_vfmaq_f16) { - cgm.errorNYI(expr->getSourceRange(), - std::string("unimplemented AArch64 builtin call: ") + - getContext().BuiltinInfo.getName(builtinID)); - return mlir::Value{}; - } - // Handle MSVC intrinsics before argument evaluation to prevent double // evaluation. assert(!cir::MissingFeatures::msvcBuiltins()); diff --git a/clang/test/CodeGen/AArch64/neon/vfmaq.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c similarity index 63% rename from clang/test/CodeGen/AArch64/neon/vfmaq.c rename to clang/test/CodeGen/AArch64/neon/fused-multiply.c index 1c05de703dace..e7957aecccf83 100644 --- a/clang/test/CodeGen/AArch64/neon/vfmaq.c +++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c @@ -1,8 +1,10 @@ // REQUIRES: aarch64-registered-target || arm-registered-target -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM -// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %} -// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=CIR %} +// RUN: %clang_cc1_cg_arm64_neon -target-feature +fullfp16 -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM +// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 -fclangir -emit-llvm %s -disable-O0-optnone | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM %} +// RUN: %if cir-enabled %{%clang_cc1_cg_arm64_neon -target-feature +fullfp16 -fclangir -emit-cir %s -disable-O0-optnone | FileCheck %s --check-prefixes=ALL,CIR %} + +// ALL: {{[Mm]}}odule //============================================================================= // NOTES @@ -10,7 +12,7 @@ // This file contains tests that were originally located in: // * clang/test/CodeGen/AArch64/neon-intrinsics.c // The main difference is the use of RUN lines that enable ClangIR lowering. -// This file currently covers the f32/f64 wrappers that lower through +// This file currently covers the f16/f32/f64 wrappers that lower through // BI__builtin_neon_vfmaq_v. // // ACLE section headings based on v2025Q2 of the ACLE specification: @@ -25,6 +27,26 @@ //===------------------------------------------------------===// +// LLVM-LABEL: @test_vfmaq_f16( +// CIR-LABEL: @vfmaq_f16( +float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>) -> !cir.vector<8 x !cir.f16> + +// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half> +// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half> +// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B_CAST]], <8 x half> [[C_CAST]], <8 x half> [[A_CAST]]) +// LLVM-NEXT: ret <8 x half> [[FMA]] + return vfmaq_f16(a, b, c); +} + // LLVM-LABEL: @test_vfmaq_f32( // CIR-LABEL: @vfmaq_f32( float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c index b8380bd8ed6d4..ff1c206fc6350 100644 --- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c @@ -1621,25 +1621,6 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfma_f16(a, b, c); } -// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_f16 -// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> -// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) -// CHECK-NEXT: ret <8 x half> [[TMP9]] -// -float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { - return vfmaq_f16(a, b, c); -} - // CHECK-LABEL: define {{[^@]+}}@test_vfms_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
