<saurabh....@arm.com> writes:
> The AArch64 FEAT_FP8 extension introduces instructions for conversion
> and scaling.
>
> This patch introduces the following intrinsics:
> 1. vcvt{1|2}_{bf16|high_bf16|low_bf16}_mf8_fpm.
> 2. vcvt{q}_mf8_f16_fpm.
> 3. vcvt_{high}_mf8_f32_fpm.
> 4. vscale{q}_{f16|f32|f64}.
>
> We introduced two aarch64_builtin_signatures enum variants, unary and
> ternary, and added support for these variants in the functions
> aarch64_fntype and aarch64_expand_pragma_builtin.
>
> We added new simd_types for integers (s32, s32q, and s64q) and for
> floating points (f8 and f8q).
>
> Because we added support for fp8 intrinsics here, we modified the check
> in acle/fp8.c that was checking that the __ARM_FEATURE_FP8 macro is not
> defined.
Since Saurabh is currently on holiday, I've done a review in the form
of a patch.  The main changes are:

* Rebase on top of the committed FEAT_LUT work.

* Add USES_FPMR to the existing flags, rather than treating it as a
  separate boolean.

* Automatically add the fpmr argument to the type signature, based on
  USES_FPMR.

* Represent the highpart operations using a combination of generic RTL
  and the corresponding lowpart operation.  This should allow more
  optimisation, though it's difficult to test without later patches.

* Use a generic "insn" int attribute for mnemonics, rather than
  individual per-instruction attributes.

* Use "0" constraints for inputs that are tied to outputs.

* Add tests that __ARM_FEATURE_FP8 is defined.

Tested on aarch64-linux-gnu.  I'll commit in about 24 hours or so if
there are no comments before then, but please let me know if you'd like
more time.  For reference, a short sketch of how the new intrinsics are
called follows the ChangeLog below.

Thanks,
Richard

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc (FLAG_USES_FPMR, FLAG_FP8):
	New flags.
	(ENTRY): Modified to support ternary operations.
	(enum class): New variants to support new signatures.
	(struct aarch64_pragma_builtins_data): Extend types to 4 elements.
	(aarch64_fntype): Handle new signatures.
	(aarch64_get_low_unspec): New function.
	(aarch64_convert_to_v64): New function, split out from...
	(aarch64_expand_pragma_builtin): ...here.  Handle new signatures.
	* config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): New
	flag for FP8.
	* config/aarch64/aarch64-simd-pragma-builtins.def: Define new fp8
	intrinsics.
	(ENTRY_BINARY, ENTRY_BINARY_LANE): Update for new ENTRY interface.
	(ENTRY_UNARY, ENTRY_TERNARY, ENTRY_UNARY_FPM): New macros.
	(ENTRY_BINARY_VHSDF_SIGNED): Likewise.
	* config/aarch64/aarch64-simd.md (@aarch64_<fpm_uns_op><mode>):
	New pattern.
	(@aarch64_<fpm_uns_op><mode>_high): Likewise.
	(@aarch64_<fpm_uns_op><mode>_high_be): Likewise.
	(@aarch64_<fpm_uns_op><mode>_high_le): Likewise.
	* config/aarch64/iterators.md (V4SF_ONLY, VQ_BHF): New mode
	iterators.
	(UNSPEC_FCVTN_FP8, UNSPEC_FCVTN2_FP8, UNSPEC_F1CVTL_FP8)
	(UNSPEC_F1CVTL2_FP8, UNSPEC_F2CVTL_FP8, UNSPEC_F2CVTL2_FP8)
	(UNSPEC_FSCALE): New unspecs.
	(VPACKB, VPACKBtype): New mode attributes.
	(b): Add support for V[48][BH]F.
	(FPM_UNARY_UNS, FPM_BINARY_UNS, SCALE_UNS): New int iterators.
	(insn): New int attribute.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/acle/fp8.c: Remove check that fp8 feature
	macro doesn't exist and...
	* gcc.target/aarch64/pragma_cpp_predefs_4.c: ...test that it does
	here.
	* gcc.target/aarch64/simd/scale_fpm.c: New test.
	* gcc.target/aarch64/simd/vcvt_fpm.c: New test.
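For reference, here is a minimal usage sketch (not part of the patch,
compiled with -march=armv9-a+fp8).  The wrapper names are made up for
the example; the intrinsic signatures follow the .def entries and the
tests added below, and the fpm_t argument is the value that
aarch64_expand_pragma_builtin moves into FPMR for the FLAG_USES_FPMR
builtins:

  #include <arm_neon.h>

  /* Narrow two f16 vectors to one fp8 vector (FCVTN); the conversion is
     controlled by the FPM value, which the compiler writes to FPMR.  */
  mfloat8x16_t
  example_pack (float16x8_t lo, float16x8_t hi, fpm_t mode)
  {
    return vcvtq_mf8_f16_fpm (lo, hi, mode);
  }

  /* Adjust the exponent of each element of A by the corresponding
     element of B (FSCALE); no FPMR argument is needed here.  */
  float32x4_t
  example_scale (float32x4_t a, int32x4_t b)
  {
    return vscaleq_f32 (a, b);
  }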
Co-authored-by: Richard Sandiford <richard.sandif...@arm.com> --- gcc/config/aarch64/aarch64-builtins.cc | 128 ++++++++++-- gcc/config/aarch64/aarch64-c.cc | 2 + .../aarch64/aarch64-simd-pragma-builtins.def | 47 ++++- gcc/config/aarch64/aarch64-simd.md | 73 +++++++ gcc/config/aarch64/iterators.md | 37 +++- gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 10 - .../gcc.target/aarch64/pragma_cpp_predefs_4.c | 10 + .../gcc.target/aarch64/simd/scale_fpm.c | 60 ++++++ .../gcc.target/aarch64/simd/vcvt_fpm.c | 197 ++++++++++++++++++ 9 files changed, 536 insertions(+), 28 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index f528592a17d..39a85699e51 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -198,10 +198,11 @@ const unsigned int FLAG_RAISE_FP_EXCEPTIONS = 1U << 1; const unsigned int FLAG_READ_MEMORY = 1U << 2; const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3; const unsigned int FLAG_WRITE_MEMORY = 1U << 4; +const unsigned int FLAG_USES_FPMR = 1U << 5; /* Indicates that READ_FPCR and RAISE_FP_EXCEPTIONS should be set for floating-point modes but not for integer modes. */ -const unsigned int FLAG_AUTO_FP = 1U << 5; +const unsigned int FLAG_AUTO_FP = 1U << 6; const unsigned int FLAG_QUIET = 0; const unsigned int FLAG_DEFAULT = FLAG_AUTO_FP; @@ -210,6 +211,7 @@ const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY; const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY; const unsigned int FLAG_LOAD = FLAG_READ_MEMORY; +const unsigned int FLAG_FP8 = FLAG_FP | FLAG_USES_FPMR; typedef struct { @@ -783,7 +785,7 @@ typedef struct AARCH64_SIMD_BUILTIN_##T##_##N##A, #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, U, F) \ +#define ENTRY(N, S, T0, T1, T2, T3, U, F) \ AARCH64_##N, enum aarch64_builtins @@ -1604,6 +1606,8 @@ enum class aarch64_builtin_signatures { binary, binary_lane, + ternary, + unary, }; namespace { @@ -1618,6 +1622,8 @@ struct simd_type { }; namespace simd_types { + constexpr simd_type f8 { V8QImode, qualifier_modal_float }; + constexpr simd_type f8q { V16QImode, qualifier_modal_float }; constexpr simd_type p8 { V8QImode, qualifier_poly }; constexpr simd_type p8q { V16QImode, qualifier_poly }; constexpr simd_type s8 { V8QImode, qualifier_none }; @@ -1644,7 +1650,11 @@ namespace simd_types { constexpr simd_type f32 { V2SFmode, qualifier_none }; constexpr simd_type f32q { V4SFmode, qualifier_none }; + constexpr simd_type s32 { V2SImode, qualifier_none }; + constexpr simd_type s32q { V4SImode, qualifier_none }; + constexpr simd_type f64q { V2DFmode, qualifier_none }; + constexpr simd_type s64q { V2DImode, qualifier_none }; constexpr simd_type none { VOIDmode, qualifier_none }; } @@ -1652,10 +1662,10 @@ namespace simd_types { } #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, U, F) \ +#define ENTRY(N, S, T0, T1, T2, T3, U, F) \ {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \ - simd_types::T2, U, aarch64_required_extensions::REQUIRED_EXTENSIONS, \ - FLAG_##F}, + simd_types::T2, simd_types::T3, U, \ + aarch64_required_extensions::REQUIRED_EXTENSIONS, FLAG_##F}, /* Initialize pragma builtins. 
*/ @@ -1663,7 +1673,7 @@ struct aarch64_pragma_builtins_data { const char *name; aarch64_builtin_signatures signature; - simd_type types[3]; + simd_type types[4]; int unspec; aarch64_required_extensions required_extensions; unsigned int flags; @@ -1687,6 +1697,17 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data) for (int i = 1; i <= 2; ++i) arg_types.quick_push (builtin_data.types[i].type ()); break; + + case aarch64_builtin_signatures::ternary: + return_type = builtin_data.types[0].type (); + for (int i = 1; i <= 3; ++i) + arg_types.quick_push (builtin_data.types[i].type ()); + break; + + case aarch64_builtin_signatures::unary: + return_type = builtin_data.types[0].type (); + arg_types.quick_push (builtin_data.types[1].type ()); + break; } switch (builtin_data.signature) { @@ -1697,6 +1718,8 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data) default: break; } + if (builtin_data.flags & FLAG_USES_FPMR) + arg_types.quick_push (uint64_type_node); return build_function_type_array (return_type, arg_types.length (), arg_types.address ()); } @@ -3538,6 +3561,36 @@ aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target) return ops[0].value; } +/* If OP is a 128-bit vector, convert it to the equivalent 64-bit vector. + Do nothing otherwise. */ +static void +aarch64_convert_to_v64 (expand_operand *op) +{ + if (known_eq (GET_MODE_BITSIZE (op->mode), 128u)) + { + op->mode = aarch64_v64_mode (GET_MODE_INNER (op->mode)).require (); + op->value = gen_lowpart (op->mode, op->value); + } +} + +/* UNSPEC is a high unspec, indicated by "2" in mnemonics and "_high" in + intrinsic names. Return the equivalent low unspec. */ +static int +aarch64_get_low_unspec (int unspec) +{ + switch (unspec) + { + case UNSPEC_FCVTN2_FP8: + return UNSPEC_FCVTN_FP8; + case UNSPEC_F1CVTL2_FP8: + return UNSPEC_F1CVTL_FP8; + case UNSPEC_F2CVTL2_FP8: + return UNSPEC_F2CVTL_FP8; + default: + gcc_unreachable (); + } +} + /* Expand CALL_EXPR EXP, given that it is a call to the function described by BUILTIN_DATA, and return the function's return value. Put the result in TARGET if convenient. */ @@ -3557,14 +3610,28 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, TYPE_MODE (TREE_TYPE (arg))); } - /* LUTI2 treats the first argument as a vector of 4 elements. The forms - with 128-bit inputs are only provided as a convenience; the upper halves - don't actually matter. */ - if (builtin_data.unspec == UNSPEC_LUTI2 - && known_eq (GET_MODE_BITSIZE (ops[1].mode), 128u)) + if (builtin_data.flags & FLAG_USES_FPMR) + { + auto fpm_input = ops.pop ().value; + auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM); + emit_move_insn (fpmr, fpm_input); + } + + switch (builtin_data.unspec) { - ops[1].mode = aarch64_v64_mode (GET_MODE_INNER (ops[1].mode)).require (); - ops[1].value = gen_lowpart (ops[1].mode, ops[1].value); + case UNSPEC_F1CVTL_FP8: + case UNSPEC_F2CVTL_FP8: + /* Convert _low forms (which take 128-bit vectors) to the base + 64-bit forms. */ + aarch64_convert_to_v64 (&ops[1]); + break; + + case UNSPEC_LUTI2: + /* LUTI2 treats the first argument as a vector of 4 elements. The forms + with 128-bit inputs are only provided as a convenience; the upper + halves don't actually matter. 
*/ + aarch64_convert_to_v64 (&ops[1]); + break; } insn_code icode; @@ -3572,10 +3639,41 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, { case UNSPEC_FAMAX: case UNSPEC_FAMIN: - icode = code_for_aarch64 (builtin_data.unspec, - builtin_data.types[0].mode); + case UNSPEC_F1CVTL_FP8: + case UNSPEC_F2CVTL_FP8: + case UNSPEC_FSCALE: + icode = code_for_aarch64 (builtin_data.unspec, ops[0].mode); + break; + + case UNSPEC_F1CVTL2_FP8: + case UNSPEC_F2CVTL2_FP8: + { + /* Add a high-part selector for the vec_merge. */ + auto src_mode = ops.last ().mode; + auto nunits = GET_MODE_NUNITS (src_mode).to_constant (); + rtx par = aarch64_simd_vect_par_cnst_half (src_mode, nunits, true); + create_fixed_operand (ops.safe_push ({}), par); + + auto unspec = aarch64_get_low_unspec (builtin_data.unspec); + icode = code_for_aarch64_high (unspec, ops[0].mode); + break; + } + + case UNSPEC_FCVTN_FP8: + icode = code_for_aarch64 (builtin_data.unspec, ops[1].mode); break; + case UNSPEC_FCVTN2_FP8: + { + auto unspec = aarch64_get_low_unspec (builtin_data.unspec); + auto mode = ops.last ().mode; + if (BYTES_BIG_ENDIAN) + icode = code_for_aarch64_high_be (unspec, mode); + else + icode = code_for_aarch64_high_le (unspec, mode); + break; + } + case UNSPEC_LUTI2: case UNSPEC_LUTI4: create_integer_operand (ops.safe_push ({}), diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc index dba103a7fb1..ae255889f5e 100644 --- a/gcc/config/aarch64/aarch64-c.cc +++ b/gcc/config/aarch64/aarch64-c.cc @@ -268,6 +268,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_SVE_BF16, "__ARM_FEATURE_SVE_BF16", pfile); + aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile); + aarch64_def_or_undef (TARGET_LS64, "__ARM_FEATURE_LS64", pfile); aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile); diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def index bc9a63b968a..6221652b38f 100644 --- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def @@ -20,11 +20,19 @@ #undef ENTRY_BINARY #define ENTRY_BINARY(N, T0, T1, T2, U, F) \ - ENTRY (N, binary, T0, T1, T2, U, F) + ENTRY (N, binary, T0, T1, T2, none, U, F) #undef ENTRY_BINARY_LANE #define ENTRY_BINARY_LANE(N, T0, T1, T2, U, F) \ - ENTRY (N, binary_lane, T0, T1, T2, U, F) + ENTRY (N, binary_lane, T0, T1, T2, none, U, F) + +#undef ENTRY_TERNARY +#define ENTRY_TERNARY(N, T0, T1, T2, T3, U, F) \ + ENTRY (N, ternary, T0, T1, T2, T3, U, F) + +#undef ENTRY_UNARY +#define ENTRY_UNARY(N, T0, T1, U, F) \ + ENTRY (N, unary, T0, T1, none, none, U, F) #undef ENTRY_BINARY_VHSDF #define ENTRY_BINARY_VHSDF(NAME, UNSPEC, FLAGS) \ @@ -34,6 +42,14 @@ ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC, FLAGS) \ ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC, FLAGS) +#undef ENTRY_BINARY_VHSDF_SIGNED +#define ENTRY_BINARY_VHSDF_SIGNED(NAME, UNSPEC, FLAGS) \ + ENTRY_BINARY (NAME##_f16, f16, f16, s16, UNSPEC, FLAGS) \ + ENTRY_BINARY (NAME##q_f16, f16q, f16q, s16q, UNSPEC, FLAGS) \ + ENTRY_BINARY (NAME##_f32, f32, f32, s32, UNSPEC, FLAGS) \ + ENTRY_BINARY (NAME##q_f32, f32q, f32q, s32q, UNSPEC, FLAGS) \ + ENTRY_BINARY (NAME##q_f64, f64q, f64q, s64q, UNSPEC, FLAGS) + #undef ENTRY_TERNARY_VLUT8 #define ENTRY_TERNARY_VLUT8(T) \ ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8, \ @@ -64,6 +80,11 @@ ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q, \ UNSPEC_LUTI4, QUIET) +#undef 
ENTRY_UNARY_VQ_BHF +#define ENTRY_UNARY_VQ_BHF(N, T1, UNSPEC, FLAGS) \ + ENTRY_UNARY (N##_bf16_mf8_fpm, bf16q, T1, UNSPEC, FLAGS) \ + ENTRY_UNARY (N##_f16_mf8_fpm, f16q, T1, UNSPEC, FLAGS) + // faminmax #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX) ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX, FP) @@ -82,3 +103,25 @@ ENTRY_TERNARY_VLUT16 (p) ENTRY_TERNARY_VLUT16 (s) ENTRY_TERNARY_VLUT16 (u) #undef REQUIRED_EXTENSIONS + +// fpm conversion +#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8) +ENTRY_UNARY_VQ_BHF (vcvt1, f8, UNSPEC_F1CVTL_FP8, FP8) +ENTRY_UNARY_VQ_BHF (vcvt1_high, f8q, UNSPEC_F1CVTL2_FP8, FP8) +ENTRY_UNARY_VQ_BHF (vcvt1_low, f8q, UNSPEC_F1CVTL_FP8, FP8) +ENTRY_UNARY_VQ_BHF (vcvt2, f8, UNSPEC_F2CVTL_FP8, FP8) +ENTRY_UNARY_VQ_BHF (vcvt2_high, f8q, UNSPEC_F2CVTL2_FP8, FP8) +ENTRY_UNARY_VQ_BHF (vcvt2_low, f8q, UNSPEC_F2CVTL_FP8, FP8) + +ENTRY_BINARY (vcvt_mf8_f16_fpm, f8, f16, f16, UNSPEC_FCVTN_FP8, FP8) +ENTRY_BINARY (vcvtq_mf8_f16_fpm, f8q, f16q, f16q, UNSPEC_FCVTN_FP8, FP8) +ENTRY_BINARY (vcvt_mf8_f32_fpm, f8, f32q, f32q, UNSPEC_FCVTN_FP8, FP8) + +ENTRY_TERNARY (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q, + UNSPEC_FCVTN2_FP8, FP8) +#undef REQUIRED_EXTENSIONS + +// fpm scaling +#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8) +ENTRY_BINARY_VHSDF_SIGNED (vscale, UNSPEC_FSCALE, FP) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 05cbd38372d..f38bad72781 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -10024,3 +10024,76 @@ (define_insn "@aarch64_lut<VLUTx2:mode><VB:mode>" "TARGET_LUT && INTVAL (operands[4]) == 4" "luti%4\t%0.8h, {%S1.8h, %T1.8h}, %2[%3]" ) + +;; fpm unary instructions (low part). +(define_insn "@aarch64_<insn><mode>" + [(set (match_operand:VQ_BHF 0 "register_operand" "=w") + (unspec:VQ_BHF + [(match_operand:V8QI 1 "register_operand" "w") + (reg:DI FPM_REGNUM)] + FPM_UNARY_UNS))] + "TARGET_FP8" + "<b><insn>\t%0.<Vtype>, %1.8b" +) + +;; fpm unary instructions (high part). +(define_insn "@aarch64_<insn><mode>_high" + [(set (match_operand:VQ_BHF 0 "register_operand" "=w") + (unspec:VQ_BHF + [(vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "w") + (match_operand:V16QI 2 "vect_par_cnst_hi_half")) + (reg:DI FPM_REGNUM)] + FPM_UNARY_UNS))] + "TARGET_FP8" + "<b><insn>2\t%0.<Vtype>, %1.16b" +) + +;; fpm binary instructions. +(define_insn "@aarch64_<insn><mode>" + [(set (match_operand:<VPACKB> 0 "register_operand" "=w") + (unspec:<VPACKB> + [(match_operand:VCVTFPM 1 "register_operand" "w") + (match_operand:VCVTFPM 2 "register_operand" "w") + (reg:DI FPM_REGNUM)] + FPM_BINARY_UNS))] + "TARGET_FP8" + "<insn>\t%0.<VPACKBtype>, %1.<Vtype>, %2.<Vtype>" +) + +;; fpm binary instructions & merge with low. 
+(define_insn "@aarch64_<insn><mode>_high_le" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (vec_concat:V16QI + (match_operand:V8QI 1 "register_operand" "0") + (unspec:V8QI + [(match_operand:V4SF_ONLY 2 "register_operand" "w") + (match_operand:V4SF_ONLY 3 "register_operand" "w") + (reg:DI FPM_REGNUM)] + FPM_BINARY_UNS)))] + "TARGET_FP8 && !BYTES_BIG_ENDIAN" + "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>"; +) + +(define_insn "@aarch64_<insn><mode>_high_be" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (vec_concat:V16QI + (unspec:V8QI + [(match_operand:V4SF_ONLY 2 "register_operand" "w") + (match_operand:V4SF_ONLY 3 "register_operand" "w") + (reg:DI FPM_REGNUM)] + FPM_BINARY_UNS) + (match_operand:V8QI 1 "register_operand" "0")))] + "TARGET_FP8 && BYTES_BIG_ENDIAN" + "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>"; +) + +;; fscale instructions +(define_insn "@aarch64_<insn><mode>" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") + (match_operand:<FCVT_TARGET> 2 "register_operand" "w")] + FSCALE_UNS))] + "TARGET_FP8" + "<insn>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 90725c7faeb..7b426aae7a8 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -41,6 +41,7 @@ (define_mode_iterator SHORT [QI HI]) ;; Iterators for single modes, for "@" patterns. (define_mode_iterator SI_ONLY [SI]) (define_mode_iterator DI_ONLY [DI]) +(define_mode_iterator V4SF_ONLY [V4SF]) ;; Iterator for all integer modes (up to 64-bit) (define_mode_iterator ALLI [QI HI SI DI]) @@ -181,6 +182,9 @@ (define_mode_iterator VSFDF [V2SF V4SF V2DF DF SF]) ;; Advanced SIMD single Float modes. (define_mode_iterator VDQSF [V2SF V4SF]) +;; Quad vector float modes with half/bfloat elements. +(define_mode_iterator VQ_BHF [V8HF V8BF]) + ;; Quad vector Float modes with half/single elements. (define_mode_iterator VQ_HSF [V8HF V4SF]) @@ -430,6 +434,9 @@ (define_mode_iterator VMULD [V4HI V8HI V2SI V4SI (define_mode_iterator VLUT [V8QI V16QI V4HI V4HF V4BF]) (define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF]) +;; Modes available for Advanced SIMD FP8 conversion operations. +(define_mode_iterator VCVTFPM [V4HF V8HF V4SF]) + ;; Iterators for single modes, for "@" patterns. (define_mode_iterator VNx16QI_ONLY [VNx16QI]) (define_mode_iterator VNx16SI_ONLY [VNx16SI]) @@ -715,6 +722,12 @@ (define_c_enum "unspec" UNSPEC_ASHIFT_SIGNED ; Used in aarch-simd.md. UNSPEC_ASHIFT_UNSIGNED ; Used in aarch64-simd.md. UNSPEC_ABS ; Used in aarch64-simd.md. + UNSPEC_FCVTN_FP8 ; Used in aarch64-simd.md. + UNSPEC_FCVTN2_FP8 ; Used in aarch64-builtins.cc. + UNSPEC_F1CVTL_FP8 ; Used in aarch64-simd.md. + UNSPEC_F1CVTL2_FP8 ; Used in aarch64-builtins.cc. + UNSPEC_F2CVTL_FP8 ; Used in aarch64-simd.md. + UNSPEC_F2CVTL2_FP8 ; Used in aarch64-builtins.cc. UNSPEC_FMAX ; Used in aarch64-simd.md. UNSPEC_FMAXNMV ; Used in aarch64-simd.md. UNSPEC_FMAXV ; Used in aarch64-simd.md. @@ -723,6 +736,7 @@ (define_c_enum "unspec" UNSPEC_FMINV ; Used in aarch64-simd.md. UNSPEC_FADDV ; Used in aarch64-simd.md. UNSPEC_FNEG ; Used in aarch64-simd.md. + UNSPEC_FSCALE ; Used in aarch64-simd.md. UNSPEC_ADDV ; Used in aarch64-simd.md. UNSPEC_SMAXV ; Used in aarch64-simd.md. UNSPEC_SMINV ; Used in aarch64-simd.md. 
@@ -1790,6 +1804,11 @@ (define_mode_attr Vntype [(V8HI "8b") (V4SI "4h") (define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h") (V2DI "4s")]) +;; The result of FCVTN on two vectors of the given mode. The result has +;; twice as many QI elements as the input. +(define_mode_attr VPACKB [(V4HF "V8QI") (V8HF "V16QI") (V4SF "V8QI")]) +(define_mode_attr VPACKBtype [(V4HF "8b") (V8HF "16b") (V4SF "8b")]) + ;; Widened modes of vector modes. (define_mode_attr VWIDE [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI") (V16QI "V8HI") @@ -2547,7 +2566,8 @@ (define_mode_attr vec_or_offset [(V8QI "vec") (V16QI "vec") (V4HI "vec") (V8HI "vec") (V2SI "vec") (V4SI "vec") (V2DI "vec") (DI "offset")]) -(define_mode_attr b [(VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "") +(define_mode_attr b [(V4BF "b") (V4HF "") (V8BF "b") (V8HF "") + (VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "") (VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "") (VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")]) @@ -3794,10 +3814,25 @@ (define_int_iterator SVE2_FP8_TERNARY_LANE_VNX4SF UNSPEC_FMLALLTB_FP8 UNSPEC_FMLALLTT_FP8]) +;; Iterators for fpm instructions + +(define_int_iterator FPM_UNARY_UNS [UNSPEC_F1CVTL_FP8 UNSPEC_F2CVTL_FP8]) + +(define_int_iterator FPM_BINARY_UNS [UNSPEC_FCVTN_FP8]) + +(define_int_iterator FSCALE_UNS [UNSPEC_FSCALE]) + ;; ------------------------------------------------------------------- ;; Int Iterators Attributes. ;; ------------------------------------------------------------------- +;; The AArch64 insn mnemonic associated with an unspec. +(define_int_attr insn + [(UNSPEC_F1CVTL_FP8 "f1cvtl") + (UNSPEC_F2CVTL_FP8 "f2cvtl") + (UNSPEC_FCVTN_FP8 "fcvtn") + (UNSPEC_FSCALE "fscale")]) + ;; The optab associated with an operation. Note that for ANDF, IORF ;; and XORF, the optab pattern is not actually defined; we just use this ;; name for consistency with the integer patterns. diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c index afb44f83f60..635a7eaf4a2 100644 --- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c @@ -5,19 +5,9 @@ #include <arm_acle.h> -#ifdef __ARM_FEATURE_FP8 -#error "__ARM_FEATURE_FP8 feature macro defined." -#endif - #pragma GCC push_options #pragma GCC target("arch=armv9.4-a+fp8") -/* We do not define __ARM_FEATURE_FP8 until all - relevant features have been added. */ -#ifdef __ARM_FEATURE_FP8 -#error "__ARM_FEATURE_FP8 feature macro defined." 
-#endif - /* **test_write_fpmr_sysreg_asm_64: ** msr fpmr, x0 diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c index 37bd844f581..e5a19aaefb6 100644 --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c @@ -263,3 +263,13 @@ #ifdef __ARM_FEATURE_GCS #error Foo #endif + +#pragma GCC target "arch=armv9-a" +#ifdef __ARM_FEATURE_FP8 +#error Foo +#endif + +#pragma GCC target "arch=armv9-a+fp8" +#ifndef __ARM_FEATURE_FP8 +#error Foo +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c new file mode 100644 index 00000000000..d95a861fcfd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c @@ -0,0 +1,60 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +** test_vscale_f16: +** fscale v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vscale_f16 (float16x4_t a, int16x4_t b) +{ + return vscale_f16 (a, b); +} + +/* +** test_vscaleq_f16: +** fscale v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vscaleq_f16 (float16x8_t a, int16x8_t b) +{ + return vscaleq_f16 (a, b); +} + +/* +** test_vscale_f32: +** fscale v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vscale_f32 (float32x2_t a, int32x2_t b) +{ + return vscale_f32 (a, b); +} + +/* +** test_vscaleq_f32: +** fscale v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vscaleq_f32 (float32x4_t a, int32x4_t b) +{ + return vscaleq_f32 (a, b); +} + +/* +** test_vscaleq_f64: +** fscale v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vscaleq_f64 (float64x2_t a, int64x2_t b) +{ + return vscaleq_f64 (a, b); +} diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c new file mode 100644 index 00000000000..39076684345 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c @@ -0,0 +1,197 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +** test_vcvt1_bf16: +** msr fpmr, x0 +** bf1cvtl v0.8h, v0.8b +** ret +*/ +bfloat16x8_t +test_vcvt1_bf16 (mfloat8x8_t a, fpm_t b) +{ + return vcvt1_bf16_mf8_fpm(a, b); +} + +/* +** test_high_vcvt1_bf16: +** msr fpmr, x0 +** bf1cvtl2 v0.8h, v0.16b +** ret +*/ +bfloat16x8_t +test_high_vcvt1_bf16 (mfloat8x16_t a, fpm_t b) +{ + return vcvt1_high_bf16_mf8_fpm(a, b); +} + +/* +** test_low_vcvt1_bf16: +** msr fpmr, x0 +** bf1cvtl v0.8h, v0.8b +** ret +*/ +bfloat16x8_t +test_low_vcvt1_bf16 (mfloat8x16_t a, fpm_t b) +{ + return vcvt1_low_bf16_mf8_fpm(a, b); +} + +/* +** test_vcvt1_f16: +** msr fpmr, x0 +** f1cvtl v0.8h, v0.8b +** ret +*/ +float16x8_t +test_vcvt1_f16 (mfloat8x8_t a, fpm_t b) +{ + return vcvt1_f16_mf8_fpm(a, b); +} + +/* +** test_high_vcvt1_f16: +** msr fpmr, x0 +** f1cvtl2 v0.8h, v0.16b +** ret +*/ +float16x8_t +test_high_vcvt1_f16 (mfloat8x16_t a, fpm_t b) +{ + return vcvt1_high_f16_mf8_fpm(a, b); +} + +/* +** test_low_vcvt1_f16: +** msr fpmr, x0 +** f1cvtl v0.8h, v0.8b +** ret +*/ +float16x8_t +test_low_vcvt1_f16 (mfloat8x16_t a, fpm_t b) +{ + return vcvt1_low_f16_mf8_fpm(a, b); +} + +/* +** test_vcvt2_bf16: +** msr fpmr, x0 +** bf2cvtl v0.8h, v0.8b +** ret +*/ +bfloat16x8_t +test_vcvt2_bf16 (mfloat8x8_t a, fpm_t b) +{ + return vcvt2_bf16_mf8_fpm(a, b); +} + +/* +** 
test_high_vcvt2_bf16: +** msr fpmr, x0 +** bf2cvtl2 v0.8h, v0.16b +** ret +*/ +bfloat16x8_t +test_high_vcvt2_bf16 (mfloat8x16_t a, fpm_t b) +{ + return vcvt2_high_bf16_mf8_fpm(a, b); +} + +/* +** test_low_vcvt2_bf16: +** msr fpmr, x0 +** bf1cvtl v0.8h, v0.8b +** ret +*/ +bfloat16x8_t +test_low_vcvt2_bf16 (mfloat8x16_t a, fpm_t b) +{ + return vcvt1_low_bf16_mf8_fpm(a, b); +} + +/* +** test_vcvt2_f16: +** msr fpmr, x0 +** f2cvtl v0.8h, v0.8b +** ret +*/ +float16x8_t +test_vcvt2_f16 (mfloat8x8_t a, fpm_t b) +{ + return vcvt2_f16_mf8_fpm(a, b); +} + +/* +** test_high_vcvt2_f16: +** msr fpmr, x0 +** f2cvtl2 v0.8h, v0.16b +** ret +*/ +float16x8_t +test_high_vcvt2_f16 (mfloat8x16_t a, fpm_t b) +{ + return vcvt2_high_f16_mf8_fpm(a, b); +} + +/* +** test_low_vcvt2_f16: +** msr fpmr, x0 +** f1cvtl v0.8h, v0.8b +** ret +*/ +float16x8_t +test_low_vcvt2_f16 (mfloat8x16_t a, fpm_t b) +{ + return vcvt1_low_f16_mf8_fpm(a, b); +} + +/* +** test_vcvt_f16: +** msr fpmr, x0 +** fcvtn v0.8b, v0.4h, v1.4h +** ret +*/ +mfloat8x8_t +test_vcvt_f16 (float16x4_t a, float16x4_t b, fpm_t c) +{ + return vcvt_mf8_f16_fpm(a, b, c); +} + +/* +** test_vcvtq_f16: +** msr fpmr, x0 +** fcvtn v0.16b, v0.8h, v1.8h +** ret +*/ +mfloat8x16_t +test_vcvtq_f16 (float16x8_t a, float16x8_t b, fpm_t c) +{ + return vcvtq_mf8_f16_fpm(a, b, c); +} + +/* +** test_vcvt_f32: +** msr fpmr, x0 +** fcvtn v0.8b, v0.4s, v1.4s +** ret +*/ +mfloat8x8_t +test_vcvt_f32 (float32x4_t a, float32x4_t b, fpm_t c) +{ + return vcvt_mf8_f32_fpm(a, b, c); +} + +/* +** test_vcvt_high_f32: +** msr fpmr, x0 +** fcvtn2 v0.16b, v1.4s, v2.4s +** ret +*/ +mfloat8x16_t +test_vcvt_high_f32 (mfloat8x8_t a, float32x4_t b, float32x4_t c, fpm_t d) +{ + return vcvt_high_mf8_f32_fpm(a, b, c, d); +} -- 2.25.1