https://github.com/yairbenavraham updated https://github.com/llvm/llvm-project/pull/204819
>From 217f9a33e64875d3a14d63cca921440a1e69157b Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham <[email protected]> Date: Tue, 16 Jun 2026 14:25:51 +0300 Subject: [PATCH 1/4] [CIR][AArch64] Lower vfma_v builtin Lower BI__builtin_neon_vfma_v through the existing vfmaq_v FMA path. Add f16, f32, and f64 coverage for the non-quad vfma_* ACLE wrappers. --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 6 ++--- .../AArch64/neon/fused-multiple-fullfp16.c | 11 ++++++++++ .../CodeGen/AArch64/neon/fused-multiply.c | 22 +++++++++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index b52e978fa0f4d..e57abe24a6c3f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -837,15 +837,15 @@ static mlir::Value emitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vcvtx_f32_v: case NEON::BI__builtin_neon_vext_v: case NEON::BI__builtin_neon_vextq_v: - case NEON::BI__builtin_neon_vfma_v: cgf.cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented AArch64 builtin call: ") + ctx.BuiltinInfo.getName(builtinID)); return mlir::Value{}; + case NEON::BI__builtin_neon_vfma_v: case NEON::BI__builtin_neon_vfmaq_v: { - // NEON intrinsic: vfmaq(accumulator, multiplicand1, multiplicand2) + // NEON intrinsic: vfma(q)(accumulator, multiplicand1, multiplicand2) // LLVM intrinsic: fma(multiplicand1, multiplicand2, accumulator) - // Reorder arguments to match LLVM fma signature + // Reorder arguments to match LLVM fma signature. mlir::Value op0 = cgf.getBuilder().createBitcast(ops[0], ty); mlir::Value op1 = cgf.getBuilder().createBitcast(ops[1], ty); mlir::Value op2 = cgf.getBuilder().createBitcast(ops[2], ty); diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c index 1460fb3b2bae1..25c7214d68ab7 100644 --- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c +++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c @@ -27,6 +27,17 @@ // 2.6.1.9.3 Fused multiply-accumulate, vector quad forms //===------------------------------------------------------===// +// LLVM-LABEL: @test_vfma_f16( +// CIR-LABEL: @vfma_f16( +float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) { +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>) -> !cir.vector<4 x !cir.f16> + +// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x half> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B_CAST:%.*]], <4 x half> [[C_CAST:%.*]], <4 x half> [[A_CAST:%.*]]) +// LLVM: ret <4 x half> [[FMA]] + return vfma_f16(a, b, c); +} + // LLVM-LABEL: @test_vfmaq_f16( // CIR-LABEL: @vfmaq_f16( float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c index 4e30b063064aa..06be15486463e 100644 --- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c +++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c @@ -29,6 +29,28 @@ // 2.1.1.2.5 Fused multiply-accumulate, vector quad forms //===------------------------------------------------------===// +// LLVM-LABEL: @test_vfma_f32( +// CIR-LABEL: @vfma_f32( +float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) { +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>) -> !cir.vector<2 x !cir.float> + +// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x float> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B_CAST:%.*]], <2 x float> [[C_CAST:%.*]], <2 x float> [[A_CAST:%.*]]) +// LLVM: ret <2 x float> [[FMA]] + return vfma_f32(a, b, c); +} + +// LLVM-LABEL: @test_vfma_f64( +// CIR-LABEL: @vfma_f64( +float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) { +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>) -> !cir.vector<1 x !cir.double> + +// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 x double> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B_CAST:%.*]], <1 x double> [[C_CAST:%.*]], <1 x double> [[A_CAST:%.*]]) +// LLVM: ret <1 x double> [[FMA]] + return vfma_f64(a, b, c); +} + // LLVM-LABEL: @test_vfmaq_f32( // CIR-LABEL: @vfmaq_f32( float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { >From 8721541215510dd45fcd4ded314e7ad42d4cd45d Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham <[email protected]> Date: Tue, 16 Jun 2026 14:40:53 +0300 Subject: [PATCH 2/4] [CIR][AArch64] Lower vfma_lane_v builtin Lower BI__builtin_neon_vfma_lane_v through the existing vfmaq_lane_v path. Add f16, f32, and f64 coverage for the non-quad vfma_lane_* ACLE wrappers. --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 15 +++-- .../AArch64/neon/fused-multiple-fullfp16.c | 57 ++++++++++++------- .../CodeGen/AArch64/neon/fused-multiply.c | 24 ++++++++ 3 files changed, 66 insertions(+), 30 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index e57abe24a6c3f..b0da33d271f6e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -2744,17 +2744,16 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, return builder.createBitcast(ops[0], ty); } case NEON::BI__builtin_neon_vfma_lane_v: - cgm.errorNYI(expr->getSourceRange(), - std::string("unimplemented AArch64 builtin call: ") + - getContext().BuiltinInfo.getName(builtinID)); - return mlir::Value{}; case NEON::BI__builtin_neon_vfmaq_lane_v: { mlir::Value addend = builder.createBitcast(ops[0], ty); mlir::Value multiplicand = builder.createBitcast(ops[1], ty); - // The lane source operand is the non-quad vector, so it has half as many - // lanes as the quad result vector. - cir::VectorType sourceTy = - cir::VectorType::get(ty.getElementType(), ty.getSize() / 2); + // For vfmaq_lane, the lane source operand is the non-quad vector, so it has + // half as many lanes as the quad result vector. For vfma_lane, it has the + // same shape as the result vector. + cir::VectorType sourceTy = cir::VectorType::get( + ty.getElementType(), builtinID == NEON::BI__builtin_neon_vfmaq_lane_v + ? ty.getSize() / 2 + : ty.getSize()); mlir::Value laneSource = builder.createBitcast(ops[2], sourceTy); laneSource = emitNeonSplat(builder, loc, laneSource, ops[3], ty.getSize()); diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c index 25c7214d68ab7..30a0eb148e1b1 100644 --- a/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c +++ b/clang/test/CodeGen/AArch64/neon/fused-multiple-fullfp16.c @@ -58,6 +58,19 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_f16(a, b, c); } +// ALL-LABEL: @test_vfma_lane_f16( +float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, + float16x4_t c) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.f16>) [#cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !cir.f16> +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>, !cir.vector<4 x !cir.f16>) -> !cir.vector<4 x !cir.f16> + +// LLVM-SAME: <4 x half> {{.*}} [[A:%.*]], <4 x half> {{.*}} [[B:%.*]], <4 x half> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[LANE:%.*]] = shufflevector <4 x half> {{.*}}, <4 x half> {{.*}}, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// LLVM: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B_CAST:%.*]], <4 x half> [[LANE]], <4 x half> [[A_CAST:%.*]]) +// LLVM: ret <4 x half> [[FMA]] + return vfma_lane_f16(a, b, c, 3); +} + // ALL-LABEL: @test_vfmaq_lane_f16( float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { @@ -78,28 +91,6 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, return vfmaq_lane_f16(a, b, c, 3); } -// ALL-LABEL: @test_vfmaq_laneq_f16( -float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, - float16x8_t c) { -// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16> -// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>) -> !cir.vector<8 x !cir.f16> - -// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} { -// LLVM: [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> -// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> -// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> -// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8> -// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8> -// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8> -// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half> -// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half> -// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half> -// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> {{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]]) -// LLVM: ret <8 x half> [[FMA]] - return vfmaq_laneq_f16(a, b, c, 7); -} - // ALL-LABEL: @test_vfma_laneq_f16( float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { @@ -121,3 +112,25 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, // LLVM: ret <4 x half> [[FMA]] return vfma_laneq_f16(a, b, c, 7); } + +// ALL-LABEL: @test_vfmaq_laneq_f16( +float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, + float16x8_t c) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !cir.f16>) [#cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i, #cir.int<7> : !s32i] : !cir.vector<8 x !cir.f16> +// CIR: cir.call_llvm_intrinsic "fma" [[LANE]], %{{.*}}, %{{.*}} : (!cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>, !cir.vector<8 x !cir.f16>) -> !cir.vector<8 x !cir.f16> + +// LLVM-SAME: <8 x half> {{.*}} [[A:%.*]], <8 x half> {{.*}} [[B:%.*]], <8 x half> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[A_I:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// LLVM-NEXT: [[B_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// LLVM-NEXT: [[C_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// LLVM-NEXT: [[A_BYTES:%.*]] = bitcast <8 x i16> [[A_I]] to <16 x i8> +// LLVM-NEXT: [[B_BYTES:%.*]] = bitcast <8 x i16> [[B_I]] to <16 x i8> +// LLVM-NEXT: [[C_BYTES:%.*]] = bitcast <8 x i16> [[C_I]] to <16 x i8> +// LLVM-NEXT: [[A_CAST:%.*]] = bitcast <16 x i8> [[A_BYTES]] to <8 x half> +// LLVM-NEXT: [[B_CAST:%.*]] = bitcast <16 x i8> [[B_BYTES]] to <8 x half> +// LLVM-NEXT: [[C_CAST:%.*]] = bitcast <16 x i8> [[C_BYTES]] to <8 x half> +// LLVM-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C_CAST]], <8 x half> {{.*}}, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// LLVM-NEXT: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B_CAST]], <8 x half> [[A_CAST]]) +// LLVM: ret <8 x half> [[FMA]] + return vfmaq_laneq_f16(a, b, c, 7); +} diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c index 06be15486463e..8347038cea0db 100644 --- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c +++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c @@ -91,6 +91,30 @@ float64x2_t test_vfmaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) { return vfmaq_f64(a, b, c); } +// ALL-LABEL: @test_vfma_lane_f32( +float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.float> +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>, !cir.vector<2 x !cir.float>) -> !cir.vector<2 x !cir.float> + +// LLVM-SAME: <2 x float> {{.*}} [[A:%.*]], <2 x float> {{.*}} [[B:%.*]], <2 x float> {{.*}} [[V:%.*]]) {{.*}} { +// LLVM: [[LANE:%.*]] = shufflevector <2 x float> {{.*}}, <2 x float> {{.*}}, <2 x i32> <i32 1, i32 1> +// LLVM: [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B_CAST:%.*]], <2 x float> [[LANE]], <2 x float> [[A_CAST:%.*]]) +// LLVM: ret <2 x float> [[FMA]] + return vfma_lane_f32(a, b, v, 1); +} + +// ALL-LABEL: @test_vfma_lane_f64( +float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) { +// CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<1 x !cir.double>) [#cir.int<0> : !s32i] : !cir.vector<1 x !cir.double> +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>, !cir.vector<1 x !cir.double>) -> !cir.vector<1 x !cir.double> + +// LLVM-SAME: <1 x double> {{.*}} [[A:%.*]], <1 x double> {{.*}} [[B:%.*]], <1 x double> {{.*}} [[V:%.*]]) {{.*}} { +// LLVM: [[LANE:%.*]] = shufflevector <1 x double> {{.*}}, <1 x double> {{.*}}, <1 x i32> zeroinitializer +// LLVM: [[FMA:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B_CAST:%.*]], <1 x double> [[LANE]], <1 x double> [[A_CAST:%.*]]) +// LLVM: ret <1 x double> [[FMA]] + return vfma_lane_f64(a, b, v, 0); +} + // ALL-LABEL: @test_vfmaq_lane_f32( float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { // CIR: [[LANE:%.*]] = cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.float>) [#cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i, #cir.int<1> : !s32i] : !cir.vector<4 x !cir.float> >From 9b542873443a691dfa2cb589b03dc5eb780d9bcb Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham <[email protected]> Date: Tue, 16 Jun 2026 15:08:57 +0300 Subject: [PATCH 3/4] [CIR][AArch64] Lower vfmas lane builtins Lower BI__builtin_neon_vfmas_lane_f32 and BI__builtin_neon_vfmas_laneq_f32 by extracting the selected lane and emitting llvm.fma. Add scalar f32 lane and laneq coverage. --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 28 +++++++++++++++++-- .../CodeGen/AArch64/neon/fused-multiply.c | 24 ++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index b0da33d271f6e..8be4d567cefc5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -2803,9 +2803,33 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr, return emitCallMaybeConstrainedBuiltin(builder, loc, "fma", ty, fmaOps); } case NEON::BI__builtin_neon_vfmah_lane_f16: - case NEON::BI__builtin_neon_vfmas_lane_f32: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return mlir::Value{}; + case NEON::BI__builtin_neon_vfmas_lane_f32: { + // Scalar lane/laneq forms use one selected element from the lane source. + mlir::Value laneSource = builder.createExtractElement( + loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3]))); + + llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]}; + return emitCallMaybeConstrainedBuiltin( + builder, loc, "fma", convertType(expr->getType()), fmaOps); + } case NEON::BI__builtin_neon_vfmah_laneq_f16: - case NEON::BI__builtin_neon_vfmas_laneq_f32: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented AArch64 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return mlir::Value{}; + case NEON::BI__builtin_neon_vfmas_laneq_f32: { + // Scalar lane/laneq forms use one selected element from the lane source. + mlir::Value laneSource = builder.createExtractElement( + loc, ops[2], static_cast<uint64_t>(getIntValueFromConstOp(ops[3]))); + + llvm::SmallVector<mlir::Value> fmaOps = {ops[1], laneSource, ops[0]}; + return emitCallMaybeConstrainedBuiltin( + builder, loc, "fma", convertType(expr->getType()), fmaOps); + } case NEON::BI__builtin_neon_vfmad_lane_f64: cgm.errorNYI(expr->getSourceRange(), std::string("unimplemented AArch64 builtin call: ") + diff --git a/clang/test/CodeGen/AArch64/neon/fused-multiply.c b/clang/test/CodeGen/AArch64/neon/fused-multiply.c index 8347038cea0db..8938958c05f38 100644 --- a/clang/test/CodeGen/AArch64/neon/fused-multiply.c +++ b/clang/test/CodeGen/AArch64/neon/fused-multiply.c @@ -310,6 +310,30 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, return vfmaq_laneq_f64(a, b, v, 0); } +// ALL-LABEL: @test_vfmas_lane_f32( +float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) { +// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<2 x !cir.float> +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.float, !cir.float, !cir.float) -> !cir.float + +// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <2 x float> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[LANE:%.*]] = extractelement <2 x float> [[C]], i{{32|64}} 1 +// LLVM: [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float [[LANE]], float [[A]]) +// LLVM: ret float [[FMA]] + return vfmas_lane_f32(a, b, c, 1); +} + +// ALL-LABEL: @test_vfmas_laneq_f32( +float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t c) { +// CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<4 x !cir.float> +// CIR: cir.call_llvm_intrinsic "fma" %{{.*}}, [[LANE]], %{{.*}} : (!cir.float, !cir.float, !cir.float) -> !cir.float + +// LLVM-SAME: float {{.*}} [[A:%.*]], float {{.*}} [[B:%.*]], <4 x float> {{.*}} [[C:%.*]]) {{.*}} { +// LLVM: [[LANE:%.*]] = extractelement <4 x float> [[C]], i{{32|64}} 3 +// LLVM: [[FMA:%.*]] = call float @llvm.fma.f32(float [[B]], float [[LANE]], float [[A]]) +// LLVM: ret float [[FMA]] + return vfmas_laneq_f32(a, b, c, 3); +} + // ALL-LABEL: @test_vfmad_laneq_f64( float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) { // CIR: [[LANE:%.*]] = cir.vec.extract %{{.*}}[%{{.*}} : !u64i] : !cir.vector<2 x !cir.double> >From ca92306495de010bda3807f592027a1b5ba5d180 Mon Sep 17 00:00:00 2001 From: Yair Ben Avraham <[email protected]> Date: Fri, 19 Jun 2026 16:33:03 +0300 Subject: [PATCH 4/4] [CIR][AArch64] Remove obsolete BF16 aliases Opaque BF16 splat, load, and store builtins now share generic NEON builtin IDs. Remove obsolete CIR alias entries to match classic CodeGen. --- .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 76 ------------------- 1 file changed, 76 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp index 8be4d567cefc5..927e85e2edc48 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp @@ -1474,22 +1474,6 @@ CIRGenFunction::emitAArch64SMEBuiltinExpr(unsigned builtinID, // Some intrinsics are equivalent for codegen. static const std::pair<unsigned, unsigned> neonEquivalentIntrinsicMap[] = { - { - NEON::BI__builtin_neon_splat_lane_bf16, - NEON::BI__builtin_neon_splat_lane_v, - }, - { - NEON::BI__builtin_neon_splat_laneq_bf16, - NEON::BI__builtin_neon_splat_laneq_v, - }, - { - NEON::BI__builtin_neon_splatq_lane_bf16, - NEON::BI__builtin_neon_splatq_lane_v, - }, - { - NEON::BI__builtin_neon_splatq_laneq_bf16, - NEON::BI__builtin_neon_splatq_laneq_v, - }, { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, @@ -1602,40 +1586,6 @@ static const std::pair<unsigned, unsigned> neonEquivalentIntrinsicMap[] = { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, }, - {NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v}, - {NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v}, - {NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v}, - {NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v}, - {NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v}, - {NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v}, - {NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v}, - {NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v}, - {NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v}, - {NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v}, - {NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v}, - {NEON::BI__builtin_neon_vld1q_lane_bf16, - NEON::BI__builtin_neon_vld1q_lane_v}, - {NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v}, - {NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v}, - {NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v}, - {NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v}, - {NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v}, - {NEON::BI__builtin_neon_vld2q_lane_bf16, - NEON::BI__builtin_neon_vld2q_lane_v}, - {NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v}, - {NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v}, - {NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v}, - {NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v}, - {NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v}, - {NEON::BI__builtin_neon_vld3q_lane_bf16, - NEON::BI__builtin_neon_vld3q_lane_v}, - {NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v}, - {NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v}, - {NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v}, - {NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v}, - {NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v}, - {NEON::BI__builtin_neon_vld4q_lane_bf16, - NEON::BI__builtin_neon_vld4q_lane_v}, { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, @@ -1812,32 +1762,6 @@ static const std::pair<unsigned, unsigned> neonEquivalentIntrinsicMap[] = { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, }, - {NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v}, - {NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v}, - {NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v}, - {NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v}, - {NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v}, - {NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v}, - {NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v}, - {NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v}, - {NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v}, - {NEON::BI__builtin_neon_vst1q_lane_bf16, - NEON::BI__builtin_neon_vst1q_lane_v}, - {NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v}, - {NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v}, - {NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v}, - {NEON::BI__builtin_neon_vst2q_lane_bf16, - NEON::BI__builtin_neon_vst2q_lane_v}, - {NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v}, - {NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v}, - {NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v}, - {NEON::BI__builtin_neon_vst3q_lane_bf16, - NEON::BI__builtin_neon_vst3q_lane_v}, - {NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v}, - {NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v}, - {NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v}, - {NEON::BI__builtin_neon_vst4q_lane_bf16, - NEON::BI__builtin_neon_vst4q_lane_v}, // The mangling rules cause us to have one ID for each type for // vldap1(q)_lane and vstl1(q)_lane, but codegen is equivalent for all of // them. Choose an arbitrary one to be handled as tha canonical variation. _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
