https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/187046
>From 5b75824a9018ff354cbd4885ec0390357131f786 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray <[email protected]> Date: Fri, 13 Mar 2026 15:35:37 +0000 Subject: [PATCH 1/2] [AArch64][clang][llvm] Add support for Armv9.7-A lookup table intrinsics Add support for the following Armv9.7-A Lookup Table (lut) instruction intrinsics: SVE2.3 ```c // Variants are also available for: _u8 and _mf8. svint8_t svluti6[_s8](svint8x2_t table, svuint8_t indices); ``` SVE2.3 and SME2.3 ```c // Variants are also available for _u16_x2 and _f16_x2. svint16_t svluti6_lane[_s16_x2](svint16x2_t table, svuint8_t indices, uint64_t imm_idx); ``` SME2.3 ```c // Variants are also available for: _u16, _f16 and _bf16. svint16x4_t svluti6_lane[_s16_x4](svint16x2_t table, svuint8x2_t indices, uint64_t imm_idx); // Variants are also available for: _u8 and _mf8. svint8x4_t svluti6_zt_s8_x4(uint64_t zt0, svuint8x3_t zn) __arm_streaming __arm_in("zt0"); // Variants are also available for: _u8 and _mf8. svint8_t svluti6_zt_s8(uint64_t zt0, svuint8_t zn) __arm_streaming __arm_in("zt0"); ``` --- clang/include/clang/Basic/arm_sme.td | 6 + clang/include/clang/Basic/arm_sve.td | 9 + clang/lib/Basic/Targets/AArch64.cpp | 29 +++ clang/lib/Basic/Targets/AArch64.h | 2 + .../sme2p3-intrinsics/acle_sme2p3_luti6.c | 175 ++++++++++++++++++ .../sve2p3-intrinsics/acle_sve2p3_luti6.c | 112 +++++++++++ .../Preprocessor/aarch64-target-features.c | 23 +++ .../acle_sme2p3_imm.c | 21 +++ .../acle_sme2p3_target.c | 20 ++ .../acle_sme2p3_target_lane.c | 16 ++ .../acle_sve2p3_imm.cpp | 24 +++ .../acle_sve2p3_target.c | 19 ++ .../acle_sve2p3_target_lane.c | 14 ++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 32 ++++ .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 102 ++++++++++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 10 +- llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 3 + .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 12 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 11 ++ .../AArch64/sme2p3-intrinsics-luti6.ll | 105 
+++++++++++ .../AArch64/sve2p3-intrinsics-luti6.ll | 55 ++++++ .../test/Verifier/AArch64/luti6-intrinsics.ll | 79 ++++++++ 22 files changed, 876 insertions(+), 3 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c create mode 100644 clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c create mode 100644 clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c create mode 100644 clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c create mode 100644 llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll create mode 100644 llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll create mode 100644 llvm/test/Verifier/AArch64/luti6-intrinsics.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 032c588966032..8de360fca5f5e 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -981,6 +981,12 @@ let SMETargetGuard = "sme-lutv2" in { def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; } +let SMETargetGuard = "sme2p3" in { + def SVLUTI6_ZT : SInst<"svluti6_zt_{d}", "diu", "cUcm", MergeNone, "aarch64_sme_luti6_zt", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; + def SVLUTI6_ZT_X4 : SInst<"svluti6_zt_{d}_x4", "4i3.u", "cUcm", MergeNone, "aarch64_sme_luti6_zt_x4", [IsOverloadNone, IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; + def SVLUTI6_LANE_X4 : SInst<"svluti6_lane[_{d}_x4]", "42.d2.[i", "sUshb", MergeNone, "aarch64_sme_luti6_lane_x4", [IsStreaming], [ImmCheck<2, 
ImmCheck0_1>]>; +} + let SMETargetGuard = "sme-f8f32" in { def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32", [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>; diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index be3cd8a76503b..b4080a8456dd5 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1876,6 +1876,15 @@ let SVETargetGuard = "(sve2|sme2),lut", SMETargetGuard = "sme2,lut" in { def SVLUTI4_x2 : SInst<"svluti4_lane[_{d}_x2]", "d2.d[i", "sUshb", MergeNone, "aarch64_sve_luti4_lane_x2", [VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_3>]>; } +let SVETargetGuard = "sve2p3", SMETargetGuard = InvalidMode in { + def SVLUTI6 : SInst<"svluti6[_{d}]", "d2u", "cUcm", MergeNone, "aarch64_sve_luti6", [IsOverloadNone]>; +} + +let SVETargetGuard = "sve2p3", SMETargetGuard = "sme2p3" in { + def SVLUTI6_x2_I16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "sUs", MergeNone, "aarch64_sve_luti6_lane_x2_i16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>; + def SVLUTI6_x2_F16 : SInst<"svluti6_lane[_{d}_x2]", "d2.d[i", "h", MergeNone, "aarch64_sve_luti6_lane_x2_f16", [IsOverloadNone, VerifyRuntimeMode], [ImmCheck<2, ImmCheck0_1>]>; +} + //////////////////////////////////////////////////////////////////////////////// // SVE2 - Optional diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index f7ed15be75cd8..9d4085e5ebbf4 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -495,6 +495,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2p1) Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1"); + if (HasSVE2p3) + Builder.defineMacro("__ARM_FEATURE_SVE2p3", "1"); + if (HasSVE2 && HasSVEAES) Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1"); @@ -521,6 +524,9 @@ void AArch64TargetInfo::getTargetDefines(const 
LangOptions &Opts, if (HasSME2p1) Builder.defineMacro("__ARM_FEATURE_SME2p1", "1"); + if (HasSME2p3) + Builder.defineMacro("__ARM_FEATURE_SME2p3", "1"); + if (HasSMEF16F16) Builder.defineMacro("__ARM_FEATURE_SME_F16F16", "1"); @@ -900,9 +906,11 @@ void AArch64TargetInfo::computeFeatureLookup() { .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sve2p1", FPU & SveMode && HasSVE2p1) + .Case("sve2p3", FPU & SveMode && HasSVE2p3) .Case("sme", HasSME) .Case("sme2", HasSME2) .Case("sme2p1", HasSME2p1) + .Case("sme2p3", HasSME2p3) .Case("sme-f64f64", HasSMEF64F64) .Case("sme-i16i64", HasSMEI16I64) .Case("sme-fa64", HasSMEFA64) @@ -1008,6 +1016,15 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasSVE2 = true; HasSVE2p1 = true; } + if (Feature == "+sve2p3") { + FPU |= NeonMode; + FPU |= SveMode; + HasFullFP16 = true; + HasSVE2 = true; + HasSVE2p1 = true; + HasSVE2p2 = true; + HasSVE2p3 = true; + } if (Feature == "+sve-aes") { FPU |= NeonMode; HasFullFP16 = true; @@ -1064,6 +1081,18 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasBFloat16 = true; HasFullFP16 = true; } + if (Feature == "+sme2p3") { + HasSME = true; + HasSME2 = true; + HasSVE2 = true; + HasSVE2p1 = true; + HasSVE2p2 = true; + HasSME2p1 = true; + HasSME2p2 = true; + HasSME2p3 = true; + HasBFloat16 = true; + HasFullFP16 = true; + } if (Feature == "+sme-f64f64") { HasSME = true; HasSMEF64F64 = true; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index f9dffed8769ef..4191070a4786d 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -85,6 +85,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasBFloat16 = false; bool HasSVE2 = false; bool HasSVE2p1 = false; + bool HasSVE2p3 = false; bool HasSVEAES = false; bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; @@ -110,6 +111,7 @@ class 
LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasSMEF16F16 = false; bool HasSMEB16B16 = false; bool HasSME2p1 = false; + bool HasSME2p3 = false; bool HasFP8 = false; bool HasFP8FMA = false; bool HasFP8DOT2 = false; diff --git a/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c new file mode 100644 index 0000000000000..ae5fb1f64d0fc --- /dev/null +++ b/clang/test/CodeGen/AArch64/sme2p3-intrinsics/acle_sme2p3_luti6.c @@ -0,0 +1,175 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s + +#include <arm_sme.h> + +// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_s16_x4( +// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> 
[[INDICES_COERCE1]], i32 1) +// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z24test_svluti6_lane_s16_x411svint16x2_t11svuint8x2_t( +// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1) +// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]] +// +svint16x4_t test_svluti6_lane_s16_x4(svint16x2_t table, svuint8x2_t indices) + __arm_streaming { + return svluti6_lane_s16_x4(table, indices, 1); +} + +// CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_svluti6_lane_u16_x4( +// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0) +// CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, 
<vscale x 8 x i16> } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @_Z24test_svluti6_lane_u16_x412svuint16x2_t11svuint8x2_t( +// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0) +// CPP-CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } [[TMP0]] +// +svuint16x4_t test_svluti6_lane_u16_x4(svuint16x2_t table, svuint8x2_t indices) + __arm_streaming { + return svluti6_lane_u16_x4(table, indices, 0); +} + +// CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @test_svluti6_lane_f16_x4( +// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1) +// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x half>, <vscale x 
8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @_Z24test_svluti6_lane_f16_x413svfloat16x2_t11svuint8x2_t( +// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 1) +// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] +// +svfloat16x4_t test_svluti6_lane_f16_x4(svfloat16x2_t table, svuint8x2_t indices) + __arm_streaming { + return svluti6_lane_f16_x4(table, indices, 1); +} + +// CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_svluti6_lane_bf16_x4( +// CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0) +// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x 
bfloat> } @_Z25test_svluti6_lane_bf16_x414svbfloat16x2_t11svuint8x2_t( +// CPP-CHECK-SAME: <vscale x 8 x bfloat> [[TABLE_COERCE0:%.*]], <vscale x 8 x bfloat> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16(<vscale x 8 x bfloat> [[TABLE_COERCE0]], <vscale x 8 x bfloat> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], i32 0) +// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] +// +svbfloat16x4_t test_svluti6_lane_bf16_x4(svbfloat16x2_t table, svuint8x2_t indices) + __arm_streaming { + return svluti6_lane_bf16_x4(table, indices, 0); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_s8( +// CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svluti6_zt_s8u11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svint8_t test_svluti6_zt_s8(svuint8_t indices) __arm_streaming __arm_in("zt0") { + return svluti6_zt_s8(0, indices); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_u8( +// CHECK-SAME: <vscale x 16 x i8> 
[[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svluti6_zt_u8u11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svuint8_t test_svluti6_zt_u8(svuint8_t indices) __arm_streaming __arm_in("zt0") { + return svluti6_zt_u8(0, indices); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_zt_mf8( +// CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svluti6_zt_mf8u11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt(i32 0, <vscale x 16 x i8> [[INDICES]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svluti6_zt_mf8(svuint8_t indices) __arm_streaming __arm_in("zt0") { + return svluti6_zt_mf8(0, indices); +} + +// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_u8_x4( +// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr 
#[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]]) +// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z21test_svluti6_zt_u8_x411svuint8x3_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]]) +// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]] +// +svuint8x4_t test_svluti6_zt_u8_x4(svuint8x3_t indices) + __arm_streaming __arm_in("zt0") { + return svluti6_zt_u8_x4(0, indices); +} + +// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_s8_x4( +// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]]) +// 
CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z21test_svluti6_zt_s8_x411svuint8x3_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]]) +// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]] +// +svint8x4_t test_svluti6_zt_s8_x4(svuint8x3_t indices) + __arm_streaming __arm_in("zt0") { + return svluti6_zt_s8_x4(0, indices); +} + +// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_svluti6_zt_mf8_x4( +// CHECK-SAME: <vscale x 16 x i8> [[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]]) +// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z22test_svluti6_zt_mf8_x411svuint8x3_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> 
[[INDICES_COERCE0:%.*]], <vscale x 16 x i8> [[INDICES_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES_COERCE2:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4(i32 0, <vscale x 16 x i8> [[INDICES_COERCE0]], <vscale x 16 x i8> [[INDICES_COERCE1]], <vscale x 16 x i8> [[INDICES_COERCE2]]) +// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]] +// +svmfloat8x4_t test_svluti6_zt_mf8_x4(svuint8x3_t indices) + __arm_streaming __arm_in("zt0") { + return svluti6_zt_mf8_x4(0, indices); +} diff --git a/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c new file mode 100644 index 0000000000000..a806ef0b13c20 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sve2p3-intrinsics/acle_sve2p3_luti6.c @@ -0,0 +1,112 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s --check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -S -O1 -Werror -o /dev/null %s + +#include <arm_sve.h> + 
+#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1, A2) A1##A2 +#endif + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_s8( +// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_s810svint8x2_tu11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svint8_t test_svluti6_s8(svint8x2_t table, svuint8_t indices) { + return SVE_ACLE_FUNC(svluti6, _s8)(table, indices); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_u8( +// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z15test_svluti6_u811svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> 
[[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svuint8_t test_svluti6_u8(svuint8x2_t table, svuint8_t indices) { + return SVE_ACLE_FUNC(svluti6, _u8)(table, indices); +} + +// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svluti6_mf8( +// CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]]) +// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z16test_svluti6_mf813svmfloat8x2_tu11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 16 x i8> [[TABLE_COERCE0:%.*]], <vscale x 16 x i8> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6(<vscale x 16 x i8> [[TABLE_COERCE0]], <vscale x 16 x i8> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]]) +// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +svmfloat8_t test_svluti6_mf8(svmfloat8x2_t table, svuint8_t indices) { + return SVE_ACLE_FUNC(svluti6, _mf8)(table, indices); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_s16_x2( +// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1) +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_s16_x211svint16x2_tu11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1) +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +svint16_t test_svluti6_lane_s16_x2(svint16x2_t table, svuint8_t indices) { + return SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(table, indices, 1); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svluti6_lane_u16_x2( +// CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0) +// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z24test_svluti6_lane_u16_x212svuint16x2_tu11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 8 x i16> [[TABLE_COERCE0:%.*]], <vscale x 8 x i16> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> 
@llvm.aarch64.sve.luti6.lane.x2.i16(<vscale x 8 x i16> [[TABLE_COERCE0]], <vscale x 8 x i16> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 0) +// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +svuint16_t test_svluti6_lane_u16_x2(svuint16x2_t table, svuint8_t indices) { + return SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(table, indices, 0); +} + +// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svluti6_lane_f16_x2( +// CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1) +// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z24test_svluti6_lane_f16_x213svfloat16x2_tu11__SVUint8_t( +// CPP-CHECK-SAME: <vscale x 8 x half> [[TABLE_COERCE0:%.*]], <vscale x 8 x half> [[TABLE_COERCE1:%.*]], <vscale x 16 x i8> [[INDICES:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16(<vscale x 8 x half> [[TABLE_COERCE0]], <vscale x 8 x half> [[TABLE_COERCE1]], <vscale x 16 x i8> [[INDICES]], i32 1) +// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +svfloat16_t test_svluti6_lane_f16_x2(svfloat16x2_t table, svuint8_t indices) { + return SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(table, indices, 1); +} diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 60ddaad639d48..6316b25befed8 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -827,9 +827,32 @@ // CHECK-SVE2p2: __ARM_NEON_FP 0xE // CHECK-SVE2p2: 
__ARM_NEON_SVE_BRIDGE 1 // +// RUN: %clang -target aarch64-none-linux-gnu -march=armv9.7-a+sve2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p3 %s +// CHECK-SVE2p3: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1 +// CHECK-SVE2p3: __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1 +// CHECK-SVE2p3: __ARM_FEATURE_SVE 1 +// CHECK-SVE2p3: __ARM_FEATURE_SVE2 1 +// CHECK-SVE2p3: __ARM_FEATURE_SVE2p1 1 +// CHECK-SVE2p3: __ARM_FEATURE_SVE2p2 1 +// CHECK-SVE2p3: __ARM_FEATURE_SVE2p3 1 +// CHECK-SVE2p3: __ARM_NEON 1 +// CHECK-SVE2p3: __ARM_NEON_FP 0xE +// CHECK-SVE2p3: __ARM_NEON_SVE_BRIDGE 1 +// CHECK-SVE2p3-NOT: __ARM_FEATURE_SME2p3 1 +// // RUN: %clang --target=aarch64 -march=armv9-a+sme2p2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p2 %s // CHECK-SME2p2: __ARM_FEATURE_LOCALLY_STREAMING 1 // CHECK-SME2p2: __ARM_FEATURE_SME 1 // CHECK-SME2p2: __ARM_FEATURE_SME2 1 // CHECK-SME2p2: __ARM_FEATURE_SME2p1 1 // CHECK-SME2p2: __ARM_FEATURE_SME2p2 1 +// +// RUN: %clang --target=aarch64 -march=armv9.7-a+sme2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p3 %s +// CHECK-SME2p3: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME2p3: __ARM_FEATURE_SME 1 +// CHECK-SME2p3: __ARM_FEATURE_SME2 1 +// CHECK-SME2p3: __ARM_FEATURE_SME2p1 1 +// CHECK-SME2p3: __ARM_FEATURE_SME2p2 1 +// CHECK-SME2p3: __ARM_FEATURE_SME2p3 1 +// CHECK-SME2p3: __ARM_FEATURE_SVE2p1 1 +// CHECK-SME2p3: __ARM_FEATURE_SVE2p2 1 diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c new file mode 100644 index 0000000000000..8883ea3580fb2 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_imm.c @@ -0,0 +1,21 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme2p3 -target-feature +bf16 -fsyntax-only -verify %s + +#include <arm_sme.h> + +void test_range_0_0(void) __arm_streaming __arm_in("zt0") { + 
svluti6_zt_s8(1, svundef_u8()); // expected-error {{argument value 1 is outside the valid range [0, 0]}} + svluti6_zt_u8_x4(1, svcreate3_u8(svundef_u8(), svundef_u8(), svundef_u8())); // expected-error {{argument value 1 is outside the valid range [0, 0]}} +} + +void test_range_0_1(void) __arm_streaming { + svluti6_lane_s16_x4(svcreate2_s16(svundef_s16(), svundef_s16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svcreate2_u8(svundef_u8(), svundef_u8()), -1); + svluti6_lane_u16_x4(svcreate2_u16(svundef_u16(), svundef_u16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svcreate2_u8(svundef_u8(), svundef_u8()), 2); + svluti6_lane_f16_x4(svcreate2_f16(svundef_f16(), svundef_f16()), // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svcreate2_u8(svundef_u8(), svundef_u8()), -1); + svluti6_lane_bf16_x4(svcreate2_bf16(svundef_bf16(), svundef_bf16()), // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svcreate2_u8(svundef_u8(), svundef_u8()), 2); +} diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c new file mode 100644 index 0000000000000..2cffc1344bfe1 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target.c @@ -0,0 +1,20 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +bf16 -verify -emit-llvm -o - %s + +#include <arm_sme.h> + +svint8_t missing_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") { + return svluti6_zt_s8(0, indices); // expected-error {{'svluti6_zt_s8' needs target feature sme,sme2p3}} +} + +__attribute__((target("sme2p3"))) +svint8_t has_sme2p3_zt(svuint8_t indices) __arm_streaming __arm_in("zt0") { + return svluti6_zt_s8(0, indices); +} + +__attribute__((target("sme2p3"))) 
+svfloat16_t has_sme2p3_implied_sme2p2(svbool_t pg, svfloat16_t op) + __arm_streaming { + return svcompact_f16(pg, op); +} diff --git a/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c new file mode 100644 index 0000000000000..1a06663a9aab7 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2p3-intrinsics/acle_sme2p3_target_lane.c @@ -0,0 +1,16 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +bf16 -verify -emit-llvm -o - %s + +#include <arm_sme.h> + +svbfloat16x4_t missing_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices) + __arm_streaming { + return svluti6_lane_bf16_x4(table, indices, 1); // expected-error {{'svluti6_lane_bf16_x4' needs target feature sme,sme2p3}} +} + +__attribute__((target("sme2p3,bf16"))) +svbfloat16x4_t has_sme2p3_lane(svbfloat16x2_t table, svuint8x2_t indices) + __arm_streaming { + return svluti6_lane_bf16_x4(table, indices, 0); +} diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp new file mode 100644 index 0000000000000..8bbb0211b0bbb --- /dev/null +++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_imm.cpp @@ -0,0 +1,24 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -fsyntax-only -verify %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2p3 -fsyntax-only -verify %s + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1, A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1, A2) A1##A2 +#endif + +#include <arm_sve.h> + +void test_range_0_1() { + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti6_lane, _s16_x2)(svcreate2_s16(svundef_s16(), 
svundef_s16()), + svundef_u8(), -1); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti6_lane, _u16_x2)(svcreate2_u16(svundef_u16(), svundef_u16()), + svundef_u8(), 2); + // expected-error-re@+1 {{argument value {{[0-9]+}} is outside the valid range [0, 1]}} + SVE_ACLE_FUNC(svluti6_lane, _f16_x2)(svcreate2_f16(svundef_f16(), svundef_f16()), + svundef_u8(), -1); +} diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c new file mode 100644 index 0000000000000..3b5596ac1d5a6 --- /dev/null +++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target.c @@ -0,0 +1,19 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s + +#include <arm_sve.h> + +void missing_sve2p3_luti6(svint8x2_t table, svuint8_t indices) { + svluti6_s8(table, indices); // expected-error {{'svluti6_s8' needs target feature sve,sve2p3}} +} + +__attribute__((target("sve2p3"))) +svint8_t has_sve2p3_luti6(svint8x2_t table, svuint8_t indices) { + return svluti6_s8(table, indices); +} + +__attribute__((target("sve2p3"))) +svfloat32_t has_sve2p3_implied_sve2p2(svbool_t pg, svfloat16_t op) { + return svcvtlt_f32_f16_z(pg, op); +} diff --git a/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c new file mode 100644 index 0000000000000..6a2465f4027fc --- /dev/null +++ b/clang/test/Sema/aarch64-sve2p3-intrinsics/acle_sve2p3_target_lane.c @@ -0,0 +1,14 @@ +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm -o - %s + +#include <arm_sve.h> + +svfloat16_t missing_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) { + return svluti6_lane_f16_x2(table, indices, 1); // expected-error {{'svluti6_lane_f16_x2' 
needs target feature (sve,sve2p3)|(sme,sme2p3)}} +} + +__attribute__((target("sve2p3"))) +svfloat16_t has_sve2p3_luti6_lane(svfloat16x2_t table, svuint8_t indices) { + return svluti6_lane_f16_x2(table, indices, 0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 75929cbc222ad..325c606149bfc 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1051,6 +1051,7 @@ def llvm_nxv4i1_ty : LLVMType<nxv4i1>; def llvm_nxv8i1_ty : LLVMType<nxv8i1>; def llvm_nxv16i1_ty : LLVMType<nxv16i1>; def llvm_nxv16i8_ty : LLVMType<nxv16i8>; +def llvm_nxv8i16_ty : LLVMType<nxv8i16>; def llvm_nxv4i32_ty : LLVMType<nxv4i32>; def llvm_nxv2i64_ty : LLVMType<nxv2i64>; def llvm_nxv8f16_ty : LLVMType<nxv8f16>; @@ -2797,12 +2798,31 @@ def int_aarch64_sve_tbx : AdvSIMD_SVE2_TBX_Intrinsic<[IntrSpeculatable]>; def int_aarch64_sve_luti2_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>; def int_aarch64_sve_luti4_lane : SVE2_LUTI_Inrinsic<[IntrSpeculatable]>; +def int_aarch64_sve_luti6 : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], + [llvm_nxv16i8_ty, + llvm_nxv16i8_ty, + llvm_nxv16i8_ty], + [IntrNoMem, IntrSpeculatable]>; def int_aarch64_sve_luti4_lane_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>; +def int_aarch64_sve_luti6_lane_x2_i16 + : DefaultAttrsIntrinsic<[llvm_nxv8i16_ty], + [llvm_nxv8i16_ty, + llvm_nxv8i16_ty, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>; +def int_aarch64_sve_luti6_lane_x2_f16 + : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty], + [llvm_nxv8f16_ty, + llvm_nxv8f16_ty, + llvm_nxv16i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<3>>, IntrSpeculatable]>; // // SVE2 - Optional bit permutation @@ -3957,6 +3977,9 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_luti4_lane_zt : 
DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>; + def int_aarch64_sme_luti6_zt + : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty], [llvm_i32_ty, llvm_nxv16i8_ty], + [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>; // Lookup table expand two registers // @@ -3978,11 +4001,20 @@ let TargetPrefix = "aarch64" in { : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty], [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>; + def int_aarch64_sme_luti6_lane_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty], + [ImmArg<ArgIndex<4>>, IntrNoMem, IntrSpeculatable]>; def int_aarch64_sme_luti4_zt_x4 : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty], [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>; + def int_aarch64_sme_luti6_zt_x4 + : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty, llvm_nxv16i8_ty, + llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>; // diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 1b706411791e9..79061be0a525f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -414,8 +414,12 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, unsigned Opc, uint32_t MaxImm); + void SelectMultiVectorLutiLaneTuple(SDNode *Node, unsigned NumOutVecs, + unsigned Opc, uint32_t MaxImm); 
void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc); + void SelectMultiVectorLutiZT(SDNode *Node, unsigned NumOutVecs, unsigned Opc, + unsigned NumInVecs); template <unsigned MaxIdx, unsigned Scale> bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) { @@ -2242,6 +2246,51 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, CurDAG->RemoveDeadNode(Node); } +void AArch64DAGToDAGISel::SelectMultiVectorLutiLaneTuple(SDNode *Node, + unsigned NumOutVecs, + unsigned Opc, + uint32_t MaxImm) { + const bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN; + const unsigned BaseOp = HasChain ? 1 : 0; + const unsigned t0 = BaseOp + 1; + const unsigned t1 = BaseOp + 2; + const unsigned i0 = BaseOp + 3; + const unsigned i1 = BaseOp + 4; + const unsigned ImmOp = BaseOp + 5; + + SDValue ImmVal = Node->getOperand(ImmOp); + if (auto *Imm = dyn_cast<ConstantSDNode>(ImmVal)) + if (Imm->getZExtValue() > MaxImm) + return; + + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + SmallVector<SDValue, 4> Ops = { + createZTuple({Node->getOperand(t0), Node->getOperand(t1)}), + createZTuple({Node->getOperand(i0), Node->getOperand(i1)}), + Node->getOperand(ImmOp), + }; + + SDNode *Instruction; + if (HasChain) { + Ops.push_back(Node->getOperand(0)); + Instruction = + CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops); + } else { + Instruction = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops); + } + SDValue SuperReg(Instruction, 0); + + for (unsigned i = 0; i < NumOutVecs; ++i) + ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + if (HasChain) + ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1)); + + CurDAG->RemoveDeadNode(Node); +} + void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc) { @@ -2271,6 +2320,50 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, CurDAG->RemoveDeadNode(Node); } 
+void AArch64DAGToDAGISel::SelectMultiVectorLutiZT(SDNode *Node, + unsigned NumOutVecs, + unsigned Opc, + unsigned NumInVecs) { + const unsigned ChainOp = 0; + const unsigned ZtOp = 2; + const unsigned FirstVecOp = 3; + + SDValue ZtValue; + if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(ZtOp), ZtValue)) + return; + + SDValue ZTuple; + switch (NumInVecs) { + case 2: + ZTuple = createZMulTuple( + {Node->getOperand(FirstVecOp), Node->getOperand(FirstVecOp + 1)}); + break; + case 3: + ZTuple = createZTuple({Node->getOperand(FirstVecOp), + Node->getOperand(FirstVecOp + 1), + Node->getOperand(FirstVecOp + 2)}); + break; + default: + llvm_unreachable("unexpected LUTI ZT tuple width"); + } + + SDValue Ops[] = {ZtValue, ZTuple, Node->getOperand(ChainOp)}; + + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + + SDNode *Instruction = + CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue SuperReg(Instruction, 0); + + for (unsigned i = 0; i < NumOutVecs; ++i) + ReplaceUses(SDValue(Node, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + ReplaceUses(SDValue(Node, NumOutVecs), SDValue(Instruction, 1)); + CurDAG->RemoveDeadNode(Node); +} + void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs, unsigned Op) { SDLoc DL(N); @@ -5903,6 +5996,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z); return; } + case Intrinsic::aarch64_sme_luti6_zt_x4: { + SelectMultiVectorLutiZT(Node, 4, AArch64::LUTI6_4ZT3Z, 3); + return; + } case Intrinsic::aarch64_sve_fp8_cvtl1_x2: if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::FP>( Node->getValueType(0), @@ -5993,6 +6090,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::SRSHL_VG4_4ZZ_S, AArch64::SRSHL_VG4_4ZZ_D})) SelectDestructiveMultiIntrinsic(Node, 4, false, Op); return; + case Intrinsic::aarch64_sme_luti6_lane_x4: + if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>( + Node->getValueType(0), {0, 
AArch64::LUTI6_4Z2Z2ZI, 0})) + SelectMultiVectorLutiLaneTuple(Node, 4, Opc, 1); + return; case Intrinsic::aarch64_sve_urshl_single_x2: if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>( Node->getValueType(0), diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 44968b14b11a9..34915a073c9aa 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -229,7 +229,7 @@ def HasF16MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasF AssemblerPredicateWithAll<(all_of FeatureF16MM), "f16mm">; def HasSVE2p3 : Predicate<"Subtarget->hasSVE2p3()">, AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">; -def HasSME2p3 : Predicate<"Subtarget->hasSME2p3()">, +def HasSME2p3 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p3()">, AssemblerPredicateWithAll<(all_of FeatureSME2p3), "sme2p3">; def HasF16F32DOT : Predicate<"Subtarget->hasF16F32DOT()">, AssemblerPredicateWithAll<(all_of FeatureF16F32DOT), "f16f32dot">; @@ -313,6 +313,14 @@ def HasNonStreamingSVE2p2_or_SME2p2 "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2), "sme2p2 or sve2p2">; +def HasNonStreamingSVE2p3 + : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()">, + AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">; +def HasNonStreamingSVE2p3_or_SME2p3 + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p3()) ||" + "(Subtarget->isStreaming() && Subtarget->hasSME2p3())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2p3, FeatureSME2p3), + "sme2p3 or sve2p3">; def HasSMEF16F16_or_SMEF8F16 : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">, diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 905eed50dee9a..c5d1fedcbc9ae 100644 --- 
a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -1186,6 +1186,9 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in { //===----------------------------------------------------------------------===// let Predicates = [HasSME2p3] in { def LUTI6_ZTZ : sme2_lut_single<"luti6">; + def : Pat<(nxv16i8 (int_aarch64_sme_luti6_zt (imm_to_zt untyped:$zt), + nxv16i8:$zn)), + (LUTI6_ZTZ $zt, nxv16i8:$zn)>; def LUTI6_4ZT3Z : sme2_luti6_zt_consecutive<"luti6">; def LUTI6_S_4ZT3Z : sme2_luti6_zt_strided<"luti6">; def LUTI6_4Z2Z2ZI : sme2_luti6_vector_vg4_consecutive<"luti6">; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index c5a3bd504adf9..248b11332e2a4 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4837,14 +4837,22 @@ let Predicates = [HasSVE2p3_or_SME2p3] in { defm SQSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqshrn", 0b000, null_frag>; defm UQSHRN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"uqshrn", 0b010, null_frag>; - defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">; } // End HasSME2p3orSVE2p3 +let Predicates = [HasNonStreamingSVE2p3_or_SME2p3] in { + defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">; +} + //===----------------------------------------------------------------------===// // SVE2.3 instructions //===----------------------------------------------------------------------===// -let Predicates = [HasSVE2p3] in { +let Predicates = [HasNonStreamingSVE2p3] in { def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">; + def : Pat<(nxv16i8 (int_aarch64_sve_luti6 nxv16i8:$Op1, nxv16i8:$Op2, + nxv16i8:$Op3)), + (LUTI6_Z2ZZ (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0, + nxv16i8:$Op2, zsub1), + nxv16i8:$Op3)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 
8a3f52090ab4c..ddec0a21b0ccb 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -11358,6 +11358,17 @@ multiclass sve2_luti6_vector_index<string mnemonic> { bit idx; let Inst{23} = idx; } + + def : Pat<(nxv8i16 (int_aarch64_sve_luti6_lane_x2_i16 nxv8i16:$Op1, nxv8i16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_1:$Op4))), + (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0, + nxv8i16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_1:$Op4))>; + def : Pat<(nxv8f16 (int_aarch64_sve_luti6_lane_x2_f16 nxv8f16:$Op1, nxv8f16:$Op2, + nxv16i8:$Op3, (i32 timm32_0_1:$Op4))), + (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0, + nxv8f16:$Op2, zsub1), + nxv16i8:$Op3, timm32_0_1:$Op4))>; } // Look up table diff --git a/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll new file mode 100644 index 0000000000000..07fb62baa58cd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p3-intrinsics-luti6.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -force-streaming -mtriple=aarch64-none-linux-gnu -mattr=+sme2p3 < %s | FileCheck %s + +target triple = "aarch64-none-linux-gnu" + +define <vscale x 16 x i8> @luti6_zt_i8(<vscale x 16 x i8> %x) #0 { +; CHECK-LABEL: luti6_zt_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti6 z0.b, zt0, z0 +; CHECK-NEXT: ret + %res = tail call <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt( + i32 0, <vscale x 16 x i8> %x) + ret <vscale x 16 x i8> %res +} + +define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, + <vscale x 16 x i8> } @luti6_zt_i8_x4(<vscale x 16 x i8> %a, + <vscale x 16 x i8> %b, + <vscale x 16 x i8> %c) #0 { +; CHECK-LABEL: luti6_zt_i8_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: luti6 { z0.b - z3.b }, zt0, { z0 - z2 } +; CHECK-NEXT: ret + %res = tail call { <vscale x 16 x i8>, <vscale 
x 16 x i8>, + <vscale x 16 x i8>, <vscale x 16 x i8> } + @llvm.aarch64.sme.luti6.zt.x4( + i32 0, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, + <vscale x 16 x i8> %c) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, + <vscale x 16 x i8> } %res +} + +define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } @luti6_i16_x4(<vscale x 8 x i16> %a, + <vscale x 8 x i16> %b, + <vscale x 16 x i8> %x, + <vscale x 16 x i8> %y) #0 { +; CHECK-LABEL: luti6_i16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1] +; CHECK-NEXT: ret + %res = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16>, <vscale x 8 x i16> } + @llvm.aarch64.sme.luti6.lane.x4.nxv8i16( + <vscale x 8 x i16> %a, <vscale x 8 x i16> %b, + <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1) + ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } %res +} + +define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, + <vscale x 8 x bfloat> } @luti6_bf16_x4(<vscale x 8 x bfloat> %a, + <vscale x 8 x bfloat> %b, + <vscale x 16 x i8> %x, + <vscale x 16 x i8> %y) #0 { +; CHECK-LABEL: luti6_bf16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[0] +; CHECK-NEXT: ret + %res = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, + <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } + @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16( + <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, + <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 0) + ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, + <vscale x 8 x bfloat> } %res +} + +define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, + <vscale x 8 x half> } @luti6_f16_x4(<vscale x 8 x half> %a, + <vscale x 8 x half> %b, + <vscale x 16 x i8> %x, + <vscale x 16 x i8> %y) #0 { +; CHECK-LABEL: luti6_f16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: 
luti6 { z0.h - z3.h }, { z0.h, z1.h }, { z2, z3 }[1] +; CHECK-NEXT: ret + %res = tail call { <vscale x 8 x half>, <vscale x 8 x half>, + <vscale x 8 x half>, <vscale x 8 x half> } + @llvm.aarch64.sme.luti6.lane.x4.nxv8f16( + <vscale x 8 x half> %a, <vscale x 8 x half> %b, + <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1) + ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, + <vscale x 8 x half> } %res +} + +declare <vscale x 16 x i8> @llvm.aarch64.sme.luti6.zt( + i32, <vscale x 16 x i8>) +declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, + <vscale x 16 x i8> } @llvm.aarch64.sme.luti6.zt.x4( + i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) +declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16( + <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, + <vscale x 16 x i8>, i32) +declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, + <vscale x 8 x bfloat> } @llvm.aarch64.sme.luti6.lane.x4.nxv8bf16( + <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 16 x i8>, + <vscale x 16 x i8>, i32) +declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, + <vscale x 8 x half> } @llvm.aarch64.sme.luti6.lane.x4.nxv8f16( + <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, + <vscale x 16 x i8>, i32) + +attributes #0 = { "target-features"="+sme2p3" } diff --git a/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll new file mode 100644 index 0000000000000..ab89e87df66d2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p3-intrinsics-luti6.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve2p3 < %s | FileCheck %s + +target triple = "aarch64-none-linux-gnu" + +define <vscale x 16 x i8> @luti6_i8(<vscale x 16 
x i8> %a, +; CHECK-LABEL: luti6_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: luti6 z0.b, { z0.b, z1.b }, z2 +; CHECK-NEXT: ret + <vscale x 16 x i8> %b, + <vscale x 16 x i8> %idx) { + %res = tail call <vscale x 16 x i8> @llvm.aarch64.sve.luti6( + <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx) + ret <vscale x 16 x i8> %res +} + +define <vscale x 8 x i16> @luti6_i16_x2(<vscale x 8 x i16> %a, +; CHECK-LABEL: luti6_i16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: luti6 z0.h, { z0.h, z1.h }, z2[1] +; CHECK-NEXT: ret + <vscale x 8 x i16> %b, + <vscale x 16 x i8> %idx) { + %res = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16( + <vscale x 8 x i16> %a, <vscale x 8 x i16> %b, + <vscale x 16 x i8> %idx, i32 1) + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x half> @luti6_f16_x2(<vscale x 8 x half> %a, +; CHECK-LABEL: luti6_f16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: luti6 z0.h, { z0.h, z1.h }, z2[0] +; CHECK-NEXT: ret + <vscale x 8 x half> %b, + <vscale x 16 x i8> %idx) { + %res = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16( + <vscale x 8 x half> %a, <vscale x 8 x half> %b, + <vscale x 16 x i8> %idx, i32 0) + ret <vscale x 8 x half> %res +} + +declare <vscale x 16 x i8> @llvm.aarch64.sve.luti6( + <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) +declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16( + <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32) +declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16( + <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 16 x i8>, i32) 
diff --git a/llvm/test/Verifier/AArch64/luti6-intrinsics.ll b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll new file mode 100644 index 0000000000000..0777c1db532b1 --- /dev/null +++ b/llvm/test/Verifier/AArch64/luti6-intrinsics.ll @@ -0,0 +1,79 @@ +; RUN: not opt -S -passes=verify < %s 2>&1 | FileCheck %s + +define <vscale x 8 x i16> @bad_sve_luti6_ret(<vscale x 16 x i8> %a, + <vscale x 16 x i8> %b, + <vscale x 16 x i8> %idx) { +; CHECK: Intrinsic has incorrect return type! + %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6( + <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %idx) + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x i16> @bad_sve_luti6_lane_x2_arg(<vscale x 4 x i32> %a, + <vscale x 8 x i16> %b, + <vscale x 16 x i8> %idx) { +; CHECK: Intrinsic has incorrect argument type! + %res = call <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16( + <vscale x 4 x i32> %a, <vscale x 8 x i16> %b, + <vscale x 16 x i8> %idx, i32 1) + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x half> @bad_sve_luti6_lane_x2_f16_arg( + <vscale x 8 x i16> %a, <vscale x 8 x half> %b, <vscale x 16 x i8> %idx) { +; CHECK: Intrinsic has incorrect argument type! + %res = call <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16( + <vscale x 8 x i16> %a, <vscale x 8 x half> %b, + <vscale x 16 x i8> %idx, i32 1) + ret <vscale x 8 x half> %res +} + +define <vscale x 8 x i16> @bad_sme_luti6_zt_ret(i32 %zt, + <vscale x 16 x i8> %idx) { +; CHECK: Intrinsic has incorrect return type! + %res = call <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt( + i32 %zt, <vscale x 16 x i8> %idx) + ret <vscale x 8 x i16> %res +} + +define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } @bad_sme_luti6_zt_x4_ret(i32 %zt, + <vscale x 16 x i8> %a, + <vscale x 16 x i8> %b, + <vscale x 16 x i8> %c) { +; CHECK: Intrinsic has incorrect return type! 
+ %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4( + i32 %zt, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, + <vscale x 16 x i8> %c) + ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } %res +} + +define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } @bad_sme_luti6_lane_x4_arg( + <vscale x 8 x half> %a, <vscale x 8 x i16> %b, + <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) { +; CHECK: Intrinsic has incorrect argument type! + %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16( + <vscale x 8 x half> %a, <vscale x 8 x i16> %b, + <vscale x 16 x i8> %x, <vscale x 16 x i8> %y, i32 1) + ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } %res +} + +declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6( + <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) +declare <vscale x 8 x i16> @llvm.aarch64.sve.luti6.lane.x2.i16( + <vscale x 4 x i32>, <vscale x 8 x i16>, <vscale x 16 x i8>, i32) +declare <vscale x 8 x half> @llvm.aarch64.sve.luti6.lane.x2.f16( + <vscale x 8 x i16>, <vscale x 8 x half>, <vscale x 16 x i8>, i32) +declare <vscale x 8 x i16> @llvm.aarch64.sme.luti6.zt(i32, <vscale x 16 x i8>) +declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.zt.x4( + i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) +declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, + <vscale x 8 x i16> } @llvm.aarch64.sme.luti6.lane.x4.nxv8i16( + <vscale x 8 x half>, <vscale x 8 x i16>, <vscale x 16 x i8>, + <vscale x 16 x i8>, i32) >From a31514483fe75b42bbf5677c443a678ee55642c8 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray <[email protected]> Date: Thu, 19 Mar 2026 16:39:11 +0000 Subject: [PATCH 2/2] fixup! 
Address PR comments --- clang/lib/Basic/Targets/AArch64.cpp | 29 ------------------ clang/lib/Basic/Targets/AArch64.h | 2 -- .../Preprocessor/aarch64-target-features.c | 23 -------------- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 5 +--- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 7 +---- llvm/lib/Target/AArch64/SMEInstrFormats.td | 22 ++++++++------ llvm/lib/Target/AArch64/SVEInstrFormats.td | 30 +++++++++++-------- 7 files changed, 33 insertions(+), 85 deletions(-) diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 9d4085e5ebbf4..f7ed15be75cd8 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -495,9 +495,6 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2p1) Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1"); - if (HasSVE2p3) - Builder.defineMacro("__ARM_FEATURE_SVE2p3", "1"); - if (HasSVE2 && HasSVEAES) Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1"); @@ -524,9 +521,6 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSME2p1) Builder.defineMacro("__ARM_FEATURE_SME2p1", "1"); - if (HasSME2p3) - Builder.defineMacro("__ARM_FEATURE_SME2p3", "1"); - if (HasSMEF16F16) Builder.defineMacro("__ARM_FEATURE_SME_F16F16", "1"); @@ -906,11 +900,9 @@ void AArch64TargetInfo::computeFeatureLookup() { .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) .Case("sve2p1", FPU & SveMode && HasSVE2p1) - .Case("sve2p3", FPU & SveMode && HasSVE2p3) .Case("sme", HasSME) .Case("sme2", HasSME2) .Case("sme2p1", HasSME2p1) - .Case("sme2p3", HasSME2p3) .Case("sme-f64f64", HasSMEF64F64) .Case("sme-i16i64", HasSMEI16I64) .Case("sme-fa64", HasSMEFA64) @@ -1016,15 +1008,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasSVE2 = true; HasSVE2p1 = true; } - if (Feature == "+sve2p3") { - FPU |= NeonMode; - FPU |= SveMode; - HasFullFP16 = true; - HasSVE2 = true; - 
HasSVE2p1 = true; - HasSVE2p2 = true; - HasSVE2p3 = true; - } if (Feature == "+sve-aes") { FPU |= NeonMode; HasFullFP16 = true; @@ -1081,18 +1064,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features, HasBFloat16 = true; HasFullFP16 = true; } - if (Feature == "+sme2p3") { - HasSME = true; - HasSME2 = true; - HasSVE2 = true; - HasSVE2p1 = true; - HasSVE2p2 = true; - HasSME2p1 = true; - HasSME2p2 = true; - HasSME2p3 = true; - HasBFloat16 = true; - HasFullFP16 = true; - } if (Feature == "+sme-f64f64") { HasSME = true; HasSMEF64F64 = true; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 4191070a4786d..f9dffed8769ef 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -85,7 +85,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasBFloat16 = false; bool HasSVE2 = false; bool HasSVE2p1 = false; - bool HasSVE2p3 = false; bool HasSVEAES = false; bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; @@ -111,7 +110,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasSMEF16F16 = false; bool HasSMEB16B16 = false; bool HasSME2p1 = false; - bool HasSME2p3 = false; bool HasFP8 = false; bool HasFP8FMA = false; bool HasFP8DOT2 = false; diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 6316b25befed8..60ddaad639d48 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -827,32 +827,9 @@ // CHECK-SVE2p2: __ARM_NEON_FP 0xE // CHECK-SVE2p2: __ARM_NEON_SVE_BRIDGE 1 // -// RUN: %clang -target aarch64-none-linux-gnu -march=armv9.7-a+sve2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2p3 %s -// CHECK-SVE2p3: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1 -// CHECK-SVE2p3: __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1 -// CHECK-SVE2p3: __ARM_FEATURE_SVE 1 -// CHECK-SVE2p3: __ARM_FEATURE_SVE2 1 
-// CHECK-SVE2p3: __ARM_FEATURE_SVE2p1 1 -// CHECK-SVE2p3: __ARM_FEATURE_SVE2p2 1 -// CHECK-SVE2p3: __ARM_FEATURE_SVE2p3 1 -// CHECK-SVE2p3: __ARM_NEON 1 -// CHECK-SVE2p3: __ARM_NEON_FP 0xE -// CHECK-SVE2p3: __ARM_NEON_SVE_BRIDGE 1 -// CHECK-SVE2p3-NOT: __ARM_FEATURE_SME2p3 1 -// // RUN: %clang --target=aarch64 -march=armv9-a+sme2p2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p2 %s // CHECK-SME2p2: __ARM_FEATURE_LOCALLY_STREAMING 1 // CHECK-SME2p2: __ARM_FEATURE_SME 1 // CHECK-SME2p2: __ARM_FEATURE_SME2 1 // CHECK-SME2p2: __ARM_FEATURE_SME2p1 1 // CHECK-SME2p2: __ARM_FEATURE_SME2p2 1 -// -// RUN: %clang --target=aarch64 -march=armv9.7-a+sme2p3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2p3 %s -// CHECK-SME2p3: __ARM_FEATURE_LOCALLY_STREAMING 1 -// CHECK-SME2p3: __ARM_FEATURE_SME 1 -// CHECK-SME2p3: __ARM_FEATURE_SME2 1 -// CHECK-SME2p3: __ARM_FEATURE_SME2p1 1 -// CHECK-SME2p3: __ARM_FEATURE_SME2p2 1 -// CHECK-SME2p3: __ARM_FEATURE_SME2p3 1 -// CHECK-SME2p3: __ARM_FEATURE_SVE2p1 1 -// CHECK-SME2p3: __ARM_FEATURE_SVE2p2 1 diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index c5d1fedcbc9ae..35a157aeb07cb 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -1185,10 +1185,7 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in { // SME2.3 instructions //===----------------------------------------------------------------------===// let Predicates = [HasSME2p3] in { - def LUTI6_ZTZ : sme2_lut_single<"luti6">; - def : Pat<(nxv16i8 (int_aarch64_sme_luti6_zt (imm_to_zt untyped:$zt), - nxv16i8:$zn)), - (LUTI6_ZTZ $zt, nxv16i8:$zn)>; + defm LUTI6_ZTZ : sme2_lut_single<"luti6", int_aarch64_sme_luti6_zt>; def LUTI6_4ZT3Z : sme2_luti6_zt_consecutive<"luti6">; def LUTI6_S_4ZT3Z : sme2_luti6_zt_strided<"luti6">; def LUTI6_4Z2Z2ZI : sme2_luti6_vector_vg4_consecutive<"luti6">; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td 
b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 248b11332e2a4..d664e66477f92 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4847,12 +4847,7 @@ let Predicates = [HasNonStreamingSVE2p3_or_SME2p3] in { // SVE2.3 instructions //===----------------------------------------------------------------------===// let Predicates = [HasNonStreamingSVE2p3] in { - def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">; - def : Pat<(nxv16i8 (int_aarch64_sve_luti6 nxv16i8:$Op1, nxv16i8:$Op2, - nxv16i8:$Op3)), - (LUTI6_Z2ZZ (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0, - nxv16i8:$Op2, zsub1), - nxv16i8:$Op3)>; + defm LUTI6_Z2ZZ : sve2_luti6_vector<"luti6", int_aarch64_sve_luti6>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 99836aeed7c0a..e17b1b2e6c7a5 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -3921,15 +3921,19 @@ multiclass sme2_luti4_vector_vg4_index<string mnemonic> { } // 8-bit Look up table -class sme2_lut_single<string asm> - : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn), - asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> { - bits<0> ZTt; - bits<5> Zd; - bits<5> Zn; - let Inst{31-10} = 0b1100000011001000010000; - let Inst{9-5} = Zn; - let Inst{4-0} = Zd; +multiclass sme2_lut_single<string asm, SDPatternOperator intrinsic> { + def NAME : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn), + asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> { + bits<0> ZTt; + bits<5> Zd; + bits<5> Zn; + let Inst{31-10} = 0b1100000011001000010000; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + } + + def : Pat<(nxv16i8 (intrinsic (imm_to_zt untyped:$zt), nxv16i8:$zn)), + (!cast<Instruction>(NAME) $zt, nxv16i8:$zn)>; } //===----------------------------------------------------------------------===// diff --git 
a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index ddec0a21b0ccb..a123265ba90aa 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -11372,18 +11372,24 @@ multiclass sve2_luti6_vector_index<string mnemonic> { } // Look up table -class sve2_luti6_vector<string mnemonic> - : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm), - mnemonic, "\t$Zd, $Zn, $Zm", - "", []>, Sched<[]> { - bits<5> Zd; - bits<5> Zn; - bits<5> Zm; - let Inst{31-21} = 0b01000101001; - let Inst{20-16} = Zm; - let Inst{15-10} = 0b101011; - let Inst{9-5} = Zn; - let Inst{4-0} = Zd; +multiclass sve2_luti6_vector<string mnemonic, SDPatternOperator intrinsic> { + def NAME : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm), + mnemonic, "\t$Zd, $Zn, $Zm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000101001; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b101011; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + } + + def : Pat<(nxv16i8 (intrinsic nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)), + (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0, + nxv16i8:$Op2, zsub1), + nxv16i8:$Op3)>; } //===----------------------------------------------------------------------===// _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
