https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/165431
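For context, the series adds the two FP8 matrix-multiply intrinsics `vmmlaq_f16_mf8_fpm` and `vmmlaq_f32_mf8_fpm`. Below is a minimal caller-side sketch, not part of the patch: it assumes a toolchain with this series applied, and it builds the `fpm_t` argument with the ACLE FP8 `__arm_fpm_*` modes helpers, whose availability is assumed here and is independent of this change.

```c
#include <arm_neon.h>

// Hypothetical example: FP8 (E4M3) matrix multiply-accumulate into an f32
// accumulator. The fpm_t value programs FPMR; the intrinsic sets FPMR and
// then issues FMMLA (v.4s <- v.16b, v.16b).
float32x4_t mmla_e4m3_into_f32(float32x4_t acc, mfloat8x16_t a,
                               mfloat8x16_t b) {
  // Assumed ACLE FP8 helpers: start from the default mode, then mark both
  // source operands as E4M3.
  fpm_t fpm = __arm_fpm_init();
  fpm = __arm_set_fpm_src1_format(fpm, __ARM_FPM_E4M3);
  fpm = __arm_set_fpm_src2_format(fpm, __ARM_FPM_E4M3);
  return vmmlaq_f32_mf8_fpm(acc, a, b, fpm);
}

// The f16-accumulating form differs only in the accumulator type.
float16x8_t mmla_e4m3_into_f16(float16x8_t acc, mfloat8x16_t a,
                               mfloat8x16_t b, fpm_t fpm) {
  return vmmlaq_f16_mf8_fpm(acc, a, b, fpm);
}
```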
>From f75e04ef5b811affb3af340b4554f09174955d1b Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <[email protected]>
Date: Mon, 27 Oct 2025 16:23:50 +0000
Subject: [PATCH 1/5] [AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8 intrinsics

Add support for the following new intrinsics:
```
  float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
  float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
```
---
 clang/include/clang/Basic/arm_neon.td         |  8 ++++++
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  8 ++++++
 .../CodeGen/AArch64/v8.6a-neon-intrinsics.c   | 27 ++++++++++++++++++-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  6 +++++
 4 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef196103035e8..8e2174c880ed8 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1896,6 +1896,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
   def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
+  def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
+  def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
 let TargetGuard = "i8mm,neon" in {
   def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
   def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 15fa78ddba715..d6b85b54f426f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7624,6 +7624,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
   }
+  case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+                           "fmmla");
+  case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "fmmla");
   case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
     ExtractLow = true;
     [[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
index 6fffcb6c6b391..0d592af59f85c 100644
--- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
@@ -32,6 +32,31 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
   return vmmlaq_u32(r, a, b);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
 // CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
 // CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..78a60e839775e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -217,6 +217,11 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : DefaultAttrsIntrinsic<[llvm_v4f32_ty],
                             [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
                             [IntrNoMem]>;
+
+  class AdvSIMD_MatMul_fpm_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
+                            [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -499,6 +504,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
   def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfmmla

>From 4b5fa3acfeee67c8478b4c56bc3e06cbe99587de Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <[email protected]>
Date: Wed, 29 Oct 2025 14:15:29 +0000
Subject: [PATCH 2/5] fixup! [AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8 intrinsics

Fix CR comments; don't create a new intrinsic, and split test files
---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 10 +++--
 .../CodeGen/AArch64/v8.6a-neon-intrinsics.c   | 27 +------------
 .../CodeGen/AArch64/v9.6a-neon-intrinsics.c   | 39 +++++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  7 +---
 4 files changed, 47 insertions(+), 36 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index d6b85b54f426f..24033116fc3fe 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7626,12 +7626,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   }
   case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
-                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
-                           "fmmla");
+                           {llvm::FixedVectorType::get(HalfTy, 8),
+                            llvm::FixedVectorType::get(HalfTy, 8)},
+                           Ops, E, "fmmla");
   case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
-                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
-                           "fmmla");
+                           {llvm::FixedVectorType::get(FloatTy, 4),
+                            llvm::FixedVectorType::get(FloatTy, 4)},
+                           Ops, E, "fmmla");
   case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
     ExtractLow = true;
     [[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
index 0d592af59f85c..6fffcb6c6b391 100644
--- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
@@ -32,31 +32,6 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
   return vmmlaq_u32(r, a, b);
 }
 
-// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
-// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
-// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
-// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
-//
-float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
-  return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
-}
-
-// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
-// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
-// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
-// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
-//
-float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
-  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
-}
-
 // CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
 // CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c
new file mode 100644
index 0000000000000..b88a22701495f
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c
@@ -0,0 +1,39 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA2_I:%.*]] = bitcast <16 x i8> [[P2]] to <8 x half>
+// CHECK-NEXT: [[FMMLA3_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v8f16(<8 x half> [[FMMLA_I]], <8 x half> [[FMMLA1_I]], <8 x half> [[FMMLA2_I]])
+// CHECK-NEXT: ret <8 x half> [[FMMLA3_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[P1]] to <4 x float>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P2]] to <4 x float>
+// CHECK-NEXT: [[FMMLA2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v4f32(<4 x float> [[P0]], <4 x float> [[FMMLA_I]], <4 x float> [[FMMLA1_I]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA2_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 78a60e839775e..4cab6e05ba79f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -217,11 +217,6 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : DefaultAttrsIntrinsic<[llvm_v4f32_ty],
                             [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
                             [IntrNoMem]>;
-
-  class AdvSIMD_MatMul_fpm_Intrinsic
-    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                            [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
-                            [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -504,7 +499,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
-  def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
+  def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfmmla

>From 846648d00e47b8aec54d1cb1d2653ea8651bb8ab Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <[email protected]>
Date: Wed, 29 Oct 2025 15:00:03 +0000
Subject: [PATCH 3/5] fixup! [AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8 intrinsics

Split testcase files
---
 ...trinsics.c => v9.6a-neon-f16-intrinsics.c} | 16 +------------
 .../AArch64/v9.6a-neon-f32-intrinsics.c       | 23 +++++++++++++++++++
 2 files changed, 24 insertions(+), 15 deletions(-)
 rename clang/test/CodeGen/AArch64/{v9.6a-neon-intrinsics.c => v9.6a-neon-f16-intrinsics.c} (63%)
 create mode 100644 clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c

diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
similarity index 63%
rename from clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c
rename to clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
index b88a22701495f..593975a9b1a4f 100644
--- a/clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +fp8 \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
@@ -23,17 +23,3 @@
 float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
   return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
 }
-
-// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
-// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
-// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[P1]] to <4 x float>
-// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P2]] to <4 x float>
-// CHECK-NEXT: [[FMMLA2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v4f32(<4 x float> [[P0]], <4 x float> [[FMMLA_I]], <4 x float> [[FMMLA1_I]])
-// CHECK-NEXT: ret <4 x float> [[FMMLA2_I]]
-//
-float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
-  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
-}
-
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
new file mode 100644
index 0000000000000..08ef002d18769
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
@@ -0,0 +1,23 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f32mm -target-feature +fp8 \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[P1]] to <4 x float>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P2]] to <4 x float>
+// CHECK-NEXT: [[FMMLA2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v4f32(<4 x float> [[P0]], <4 x float> [[FMMLA_I]], <4 x float> [[FMMLA1_I]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA2_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+  return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+

>From 533acb260d7d9ec3b5747c91a4fc32d68f086981 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <[email protected]>
Date: Thu, 6 Nov 2025 15:02:14 +0000
Subject: [PATCH 4/5] fixup! [AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8 intrinsics

Add extra lowering
---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  4 ++--
 .../AArch64/v9.6a-neon-f16-intrinsics.c       |  6 ++----
 .../AArch64/v9.6a-neon-f32-intrinsics.c       |  6 ++----
 .../lib/Target/AArch64/AArch64InstrFormats.td | 20 +++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 11 ++++++++++
 .../CodeGen/AArch64/aarch64-matmul-fp16.ll    | 14 +++++++++++++
 .../CodeGen/AArch64/aarch64-matmul-fp32.ll    | 13 ++++++++++++
 7 files changed, 64 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 24033116fc3fe..5421d46377e66 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7627,12 +7627,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
                            {llvm::FixedVectorType::get(HalfTy, 8),
-                            llvm::FixedVectorType::get(HalfTy, 8)},
+                            llvm::FixedVectorType::get(Int8Ty, 16)},
                            Ops, E, "fmmla");
   case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
                            {llvm::FixedVectorType::get(FloatTy, 4),
-                            llvm::FixedVectorType::get(FloatTy, 4)},
+                            llvm::FixedVectorType::get(Int8Ty, 16)},
                            Ops, E, "fmmla");
   case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
     ExtractLow = true;
     [[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
index 593975a9b1a4f..89ee9e38bb3fb 100644
--- a/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f16-intrinsics.c
@@ -15,10 +15,8 @@
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
 // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
 // CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P1]] to <8 x half>
-// CHECK-NEXT: [[FMMLA2_I:%.*]] = bitcast <16 x i8> [[P2]] to <8 x half>
-// CHECK-NEXT: [[FMMLA3_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v8f16(<8 x half> [[FMMLA_I]], <8 x half> [[FMMLA1_I]], <8 x half> [[FMMLA2_I]])
-// CHECK-NEXT: ret <8 x half> [[FMMLA3_I]]
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
 //
 float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
   return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
index 08ef002d18769..13db72c2cbdd1 100644
--- a/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v9.6a-neon-f32-intrinsics.c
@@ -12,10 +12,8 @@
 // CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
-// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[P1]] to <4 x float>
-// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P2]] to <4 x float>
-// CHECK-NEXT: [[FMMLA2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v4f32(<4 x float> [[P0]], <4 x float> [[FMMLA_I]], <4 x float> [[FMMLA1_I]])
-// CHECK-NEXT: ret <4 x float> [[FMMLA2_I]]
+// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
 //
 float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
   return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 58a53af76e1b5..1957b2b77a920 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -9111,6 +9111,26 @@ class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNo
   let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}";
 }
 
+multiclass SIMDThreeSameVectorMatMulFP<bit B, bit U, string asm, SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasF8F16MM] in {
+    def fp16 : BaseSIMDThreeSameVectorTied<1, U, 0b000, {0b1101, B}, V128, asm, ".8h",
+                   [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
+                                                    (v16i8 V128:$Rn),
+                                                    (v16i8 V128:$Rm)))]> {
+      let AsmString = asm # "{\t$Rd.4h, $Rn.16b, $Rm.16b|.8h\t$Rd, $Rn, $Rm}";
+    }
+  }
+
+  let Predicates = [HasNEON, HasF8F32MM] in {
+    def fp32 : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1101, B}, V128, asm, ".4s",
+                   [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
+                                                    (v16i8 V128:$Rn),
+                                                    (v16i8 V128:$Rm)))]> {
+      let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}";
+    }
+  }
+}
+
 //----------------------------------------------------------------------------
 // ARMv8.2-A Dot Product Instructions (Indexed)
 class BaseSIMDThreeSameVectorIndexS<bit Q, bit U, bits<2> size, bits<4> opc, string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2871a20e28b65..a9f29a97a6028 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1731,10 +1731,21 @@ def BFCVT : BF16ToSinglePrecision<"bfcvt">;
 let Predicates = [HasMatMulInt8] in {
 def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
 def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
+defm FMMLA : SIMDThreeSameVectorMatMulFP<1, 1, "fmmla", int_aarch64_neon_fmmla>;
 def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
 defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", AArch64usdot>;
 defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", AArch64usdot>;
 
+// FMMLA fp16
+def : Pat<(v8f16 (int_aarch64_neon_fmmla
+              (v8f16 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+          (FMMLAfp16 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+// FMMLA fp32
+def : Pat<(v4f32 (int_aarch64_neon_fmmla
+              (v4f32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+          (FMMLAfp32 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
 // sudot lane has a pattern where usdot is expected (there is no sudot).
 // The second operand is used in the dup operation to repeat the indexed
 // element.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
new file mode 100644
index 0000000000000..79852f2079eca
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s
+
+define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: fmmla.v8f16.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmmla v0.4h, v1.16b, v2.16b
+; CHECK-NEXT: ret
+entry:
+  %vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <8 x half> %vfmmla1.i
+}
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll
new file mode 100644
index 0000000000000..4c33567732687
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s
+
+define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: fmmla.v4f32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
+entry:
+  %vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <4 x float> %vfmmla1.i
+}

>From 0d8078f63f699d2fcbed670a0c913e1cd63417e1 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <[email protected]>
Date: Thu, 6 Nov 2025 20:44:06 +0000
Subject: [PATCH 5/5] fixup! [AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8 intrinsics

Make it work properly
---
 .../lib/Target/AArch64/AArch64InstrFormats.td | 36 ++++++-------------
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 12 +------
 .../CodeGen/AArch64/aarch64-matmul-fp16.ll    |  2 +-
 3 files changed, 13 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 1957b2b77a920..bb2f083db19ef 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -9111,26 +9111,6 @@ class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNo
   let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}";
 }
 
-multiclass SIMDThreeSameVectorMatMulFP<bit B, bit U, string asm, SDPatternOperator OpNode> {
-  let Predicates = [HasNEON, HasF8F16MM] in {
-    def fp16 : BaseSIMDThreeSameVectorTied<1, U, 0b000, {0b1101, B}, V128, asm, ".8h",
-                   [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
-                                                    (v16i8 V128:$Rn),
-                                                    (v16i8 V128:$Rm)))]> {
-      let AsmString = asm # "{\t$Rd.4h, $Rn.16b, $Rm.16b|.8h\t$Rd, $Rn, $Rm}";
-    }
-  }
-
-  let Predicates = [HasNEON, HasF8F32MM] in {
-    def fp32 : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1101, B}, V128, asm, ".4s",
-                   [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
-                                                    (v16i8 V128:$Rn),
-                                                    (v16i8 V128:$Rm)))]> {
-      let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b|.4s\t$Rd, $Rn, $Rm}";
-    }
-  }
-}
-
 //----------------------------------------------------------------------------
 // ARMv8.2-A Dot Product Instructions (Indexed)
 class BaseSIMDThreeSameVectorIndexS<bit Q, bit U, bits<2> size, bits<4> opc, string asm,
@@ -13312,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
   def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;
 }
 
-class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>
+class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern>
   : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,
-                                V128, asm, ".16b", []> {
+                                V128, asm, ".16b", pattern> {
   let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",
                              "|", kind, "\t$Rd, $Rn, $Rm}");
 }
 
-multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
-  def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{
+multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{
+  def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h",
+                 [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
+                                                  (v16i8 V128:$Rn),
+                                                  (v16i8 V128:$Rm)))]> {
     let Predicates = [HasNEON, HasF8F16MM];
   }
-  def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{
+  def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s",
+                 [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
+                                                  (v16i8 V128:$Rn),
+                                                  (v16i8 V128:$Rm)))]> {
    let Predicates = [HasNEON, HasF8F32MM];
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a9f29a97a6028..08ef05f9ace31 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1731,20 +1731,10 @@ def BFCVT : BF16ToSinglePrecision<"bfcvt">;
 let Predicates = [HasMatMulInt8] in {
 def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
 def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
-defm FMMLA : SIMDThreeSameVectorMatMulFP<1, 1, "fmmla", int_aarch64_neon_fmmla>;
 def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
 defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", AArch64usdot>;
 defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", AArch64usdot>;
 
-// FMMLA fp16
-def : Pat<(v8f16 (int_aarch64_neon_fmmla
-              (v8f16 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
-          (FMMLAfp16 V128:$Rd, V128:$Rn, V128:$Rm)>;
-
-// FMMLA fp32
-def : Pat<(v4f32 (int_aarch64_neon_fmmla
-              (v4f32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
-          (FMMLAfp32 V128:$Rd, V128:$Rn, V128:$Rm)>;
 
 // sudot lane has a pattern where usdot is expected (there is no sudot).
 // The second operand is used in the dup operation to repeat the indexed
@@ -11426,7 +11416,7 @@ let Predicates = [HasF16F32MM] in
   defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">;
 
 let Uses = [FPMR, FPCR] in
-  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
+  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>;
 
 //===----------------------------------------------------------------------===//
 // Contention Management Hints (FEAT_CMH)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
index 79852f2079eca..8d1abdd5380db 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
@@ -5,7 +5,7 @@
 define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: fmmla.v8f16.v16i8:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmmla v0.4h, v1.16b, v2.16b
+; CHECK-NEXT: fmmla v0.8h, v1.16b, v2.16b
 ; CHECK-NEXT: ret
 entry:
   %vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
