<saurabh....@arm.com> writes:
> The AArch64 FEAT_FP8 extension introduces instructions for conversion
> and scaling.
>
> This patch introduces the following intrinsics:
> 1. vcvt{1|2}_{bf16|f16}_mf8_fpm and vcvt{1|2}_{high|low}_{bf16|f16}_mf8_fpm.
> 2. vcvt{q}_mf8_f16_fpm.
> 3. vcvt{_high}_mf8_f32_fpm.
> 4. vscale{q}_{f16|f32|f64}.
>
> We introduced two aarch64_builtin_signatures enum variants, unary and
> ternary, and added support for these variants in the functions
> aarch64_fntype and aarch64_expand_pragma_builtin.
>
> We added new simd_types for integers (s32, s32q, and s64q) and for
> floating-point values (f8 and f8q).
>
> Because we added support for fp8 intrinsics here, we removed the
> checks in acle/fp8.c that the __ARM_FEATURE_FP8 macro is not defined.

Since Saurabh is currently on holiday, I've done a review in the form
of a patch.  The main changes are:

* Rebase on top of the committed FEAT_LUT work.

* Add USES_FPMR to the existing flags, rather than treating it as
  a separate boolean.

* Automatically add the fpmr argument to the type signature, based
  on USES_FPMR.

* Represent the highpart operations using a combination of generic
  RTL and the corresponding lowpart operation.  This should allow more
  optimisation, though it's difficult to test without later patches.
  (A rough sketch of the resulting RTL follows this list.)

* Use a generic "insn" int attribute for mnemonics, rather than
  individual per-instruction attributes.

* Use "0" constraints for inputs that are tied to outputs.

* Add tests that __ARM_FEATURE_FP8 is defined.
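
For reference, the high-part conversions now expand to RTL of roughly
the following shape (an illustrative sketch based on the
@aarch64_<insn><mode>_high pattern below, taking
vcvt1_high_f16_mf8_fpm as the example; "dst" and "src" are just
placeholder registers):

  (set (reg:V8HF dst)
       (unspec:V8HF
         [(vec_select:V8QI (reg:V16QI src)
            (parallel [(const_int 8) ... (const_int 15)]))
          (reg:DI FPM_REGNUM)]
         UNSPEC_F1CVTL_FP8))

The selection of the high half is ordinary vec_select RTL that combine
and friends can reason about; only the conversion itself is behind an
unspec.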

Tested on aarch64-linux-gnu.  I'll commit in about 24 hours if there
are no comments before then, but please let me know if you'd like
more time.

Thanks,
Richard


gcc/ChangeLog:

        * config/aarch64/aarch64-builtins.cc
        (FLAG_USES_FPMR, FLAG_FP8): New flags.
        (ENTRY): Modify to support ternary operations.
        (aarch64_builtin_signatures): Add unary and ternary variants.
        (struct aarch64_pragma_builtins_data): Extend types to 4 elements.
        (aarch64_fntype): Handle new signatures.
        (aarch64_get_low_unspec): New function.
        (aarch64_convert_to_v64): New function, split out from...
        (aarch64_expand_pragma_builtin): ...here.  Handle new signatures.
        * config/aarch64/aarch64-c.cc
        (aarch64_update_cpp_builtins): Define __ARM_FEATURE_FP8.
        * config/aarch64/aarch64-simd-pragma-builtins.def: Define new fp8
        intrinsics.
        (ENTRY_BINARY, ENTRY_BINARY_LANE): Update for new ENTRY interface.
        (ENTRY_UNARY, ENTRY_TERNARY, ENTRY_UNARY_FPM): New macros.
        (ENTRY_BINARY_VHSDF_SIGNED): Likewise.
        * config/aarch64/aarch64-simd.md
        (@aarch64_<insn><mode>): New patterns.
        (@aarch64_<insn><mode>_high): New pattern.
        (@aarch64_<insn><mode>_high_be): Likewise.
        (@aarch64_<insn><mode>_high_le): Likewise.
        * config/aarch64/iterators.md (V4SF_ONLY, VQ_BHF, VCVTFPM): New
        mode iterators.
        (UNSPEC_FCVTN_FP8, UNSPEC_FCVTN2_FP8, UNSPEC_F1CVTL_FP8)
        (UNSPEC_F1CVTL2_FP8, UNSPEC_F2CVTL_FP8, UNSPEC_F2CVTL2_FP8)
        (UNSPEC_FSCALE): New unspecs.
        (VPACKB, VPACKBtype): New mode attributes.
        (b): Add support for V[48][BH]F.
        (FPM_UNARY_UNS, FPM_BINARY_UNS, FSCALE_UNS): New int iterators.
        (insn): New int attribute.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/acle/fp8.c: Remove check that fp8 feature
        macro doesn't exist and...
        * gcc.target/aarch64/pragma_cpp_predefs_4.c: ...test that it does here.
        * gcc.target/aarch64/simd/scale_fpm.c: New test.
        * gcc.target/aarch64/simd/vcvt_fpm.c: New test.

Co-authored-by: Richard Sandiford <richard.sandif...@arm.com>
---
 gcc/config/aarch64/aarch64-builtins.cc        | 128 ++++++++++--
 gcc/config/aarch64/aarch64-c.cc               |   2 +
 .../aarch64/aarch64-simd-pragma-builtins.def  |  47 ++++-
 gcc/config/aarch64/aarch64-simd.md            |  73 +++++++
 gcc/config/aarch64/iterators.md               |  37 +++-
 gcc/testsuite/gcc.target/aarch64/acle/fp8.c   |  10 -
 .../gcc.target/aarch64/pragma_cpp_predefs_4.c |  10 +
 .../gcc.target/aarch64/simd/scale_fpm.c       |  60 ++++++
 .../gcc.target/aarch64/simd/vcvt_fpm.c        | 197 ++++++++++++++++++
 9 files changed, 536 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index f528592a17d..39a85699e51 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -198,10 +198,11 @@ const unsigned int FLAG_RAISE_FP_EXCEPTIONS = 1U << 1;
 const unsigned int FLAG_READ_MEMORY = 1U << 2;
 const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3;
 const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
+const unsigned int FLAG_USES_FPMR = 1U << 5;
 
 /* Indicates that READ_FPCR and RAISE_FP_EXCEPTIONS should be set for
    floating-point modes but not for integer modes.  */
-const unsigned int FLAG_AUTO_FP = 1U << 5;
+const unsigned int FLAG_AUTO_FP = 1U << 6;
 
 const unsigned int FLAG_QUIET = 0;
 const unsigned int FLAG_DEFAULT = FLAG_AUTO_FP;
@@ -210,6 +211,7 @@ const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
   | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
 const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY;
 const unsigned int FLAG_LOAD = FLAG_READ_MEMORY;
+const unsigned int FLAG_FP8 = FLAG_FP | FLAG_USES_FPMR;
 
 typedef struct
 {
@@ -783,7 +785,7 @@ typedef struct
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
 
 #undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U, F)          \
+#define ENTRY(N, S, T0, T1, T2, T3, U, F)      \
   AARCH64_##N,
 
 enum aarch64_builtins
@@ -1604,6 +1606,8 @@ enum class aarch64_builtin_signatures
 {
   binary,
   binary_lane,
+  ternary,
+  unary,
 };
 
 namespace {
@@ -1618,6 +1622,8 @@ struct simd_type {
 };
 
 namespace simd_types {
+  constexpr simd_type f8 { V8QImode, qualifier_modal_float };
+  constexpr simd_type f8q { V16QImode, qualifier_modal_float };
   constexpr simd_type p8 { V8QImode, qualifier_poly };
   constexpr simd_type p8q { V16QImode, qualifier_poly };
   constexpr simd_type s8 { V8QImode, qualifier_none };
@@ -1644,7 +1650,11 @@ namespace simd_types {
 
   constexpr simd_type f32 { V2SFmode, qualifier_none };
   constexpr simd_type f32q { V4SFmode, qualifier_none };
+  constexpr simd_type s32 { V2SImode, qualifier_none };
+  constexpr simd_type s32q { V4SImode, qualifier_none };
+
   constexpr simd_type f64q { V2DFmode, qualifier_none };
+  constexpr simd_type s64q { V2DImode, qualifier_none };
 
   constexpr simd_type none { VOIDmode, qualifier_none };
 }
@@ -1652,10 +1662,10 @@ namespace simd_types {
 }
 
 #undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U, F) \
+#define ENTRY(N, S, T0, T1, T2, T3, U, F) \
   {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
-   simd_types::T2, U, aarch64_required_extensions::REQUIRED_EXTENSIONS, \
-   FLAG_##F},
+   simd_types::T2, simd_types::T3, U, \
+   aarch64_required_extensions::REQUIRED_EXTENSIONS, FLAG_##F},
 
 /* Initialize pragma builtins.  */
 
@@ -1663,7 +1673,7 @@ struct aarch64_pragma_builtins_data
 {
   const char *name;
   aarch64_builtin_signatures signature;
-  simd_type types[3];
+  simd_type types[4];
   int unspec;
   aarch64_required_extensions required_extensions;
   unsigned int flags;
@@ -1687,6 +1697,17 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
       for (int i = 1; i <= 2; ++i)
        arg_types.quick_push (builtin_data.types[i].type ());
       break;
+
+    case aarch64_builtin_signatures::ternary:
+      return_type = builtin_data.types[0].type ();
+      for (int i = 1; i <= 3; ++i)
+       arg_types.quick_push (builtin_data.types[i].type ());
+      break;
+
+    case aarch64_builtin_signatures::unary:
+      return_type = builtin_data.types[0].type ();
+      arg_types.quick_push (builtin_data.types[1].type ());
+      break;
     }
   switch (builtin_data.signature)
     {
@@ -1697,6 +1718,8 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
     default:
       break;
     }
+  if (builtin_data.flags & FLAG_USES_FPMR)
+    arg_types.quick_push (uint64_type_node);
   return build_function_type_array (return_type, arg_types.length (),
                                    arg_types.address ());
 }
@@ -3538,6 +3561,36 @@ aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
   return ops[0].value;
 }
 
+/* If OP is a 128-bit vector, convert it to the equivalent 64-bit vector.
+   Do nothing otherwise.  */
+static void
+aarch64_convert_to_v64 (expand_operand *op)
+{
+  if (known_eq (GET_MODE_BITSIZE (op->mode), 128u))
+    {
+      op->mode = aarch64_v64_mode (GET_MODE_INNER (op->mode)).require ();
+      op->value = gen_lowpart (op->mode, op->value);
+    }
+}
+
+/* UNSPEC is a high unspec, indicated by "2" in mnemonics and "_high" in
+   intrinsic names.  Return the equivalent low unspec.  */
+static int
+aarch64_get_low_unspec (int unspec)
+{
+  switch (unspec)
+    {
+    case UNSPEC_FCVTN2_FP8:
+      return UNSPEC_FCVTN_FP8;
+    case UNSPEC_F1CVTL2_FP8:
+      return UNSPEC_F1CVTL_FP8;
+    case UNSPEC_F2CVTL2_FP8:
+      return UNSPEC_F2CVTL_FP8;
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* Expand CALL_EXPR EXP, given that it is a call to the function described
    by BUILTIN_DATA, and return the function's return value.  Put the result
    in TARGET if convenient.  */
@@ -3557,14 +3610,28 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
                            TYPE_MODE (TREE_TYPE (arg)));
     }
 
-  /* LUTI2 treats the first argument as a vector of 4 elements.  The forms
-     with 128-bit inputs are only provided as a convenience; the upper halves
-     don't actually matter.  */
-  if (builtin_data.unspec == UNSPEC_LUTI2
-      && known_eq (GET_MODE_BITSIZE (ops[1].mode), 128u))
+  if (builtin_data.flags & FLAG_USES_FPMR)
+    {
+      auto fpm_input = ops.pop ().value;
+      auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
+      emit_move_insn (fpmr, fpm_input);
+    }
+
+  switch (builtin_data.unspec)
     {
-      ops[1].mode = aarch64_v64_mode (GET_MODE_INNER (ops[1].mode)).require ();
-      ops[1].value = gen_lowpart (ops[1].mode, ops[1].value);
+    case UNSPEC_F1CVTL_FP8:
+    case UNSPEC_F2CVTL_FP8:
+      /* Convert _low forms (which take 128-bit vectors) to the base
+        64-bit forms.  */
+      aarch64_convert_to_v64 (&ops[1]);
+      break;
+
+    case UNSPEC_LUTI2:
+      /* LUTI2 treats the first argument as a vector of 4 elements.  The forms
+        with 128-bit inputs are only provided as a convenience; the upper
+        halves don't actually matter.  */
+      aarch64_convert_to_v64 (&ops[1]);
+      break;
     }
 
   insn_code icode;
@@ -3572,10 +3639,41 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
     {
     case UNSPEC_FAMAX:
     case UNSPEC_FAMIN:
-      icode = code_for_aarch64 (builtin_data.unspec,
-                               builtin_data.types[0].mode);
+    case UNSPEC_F1CVTL_FP8:
+    case UNSPEC_F2CVTL_FP8:
+    case UNSPEC_FSCALE:
+      icode = code_for_aarch64 (builtin_data.unspec, ops[0].mode);
+      break;
+
+    case UNSPEC_F1CVTL2_FP8:
+    case UNSPEC_F2CVTL2_FP8:
+      {
+       /* Add a high-part selector for the vec_merge.  */
+       auto src_mode = ops.last ().mode;
+       auto nunits = GET_MODE_NUNITS (src_mode).to_constant ();
+       rtx par = aarch64_simd_vect_par_cnst_half (src_mode, nunits, true);
+       create_fixed_operand (ops.safe_push ({}), par);
+
+       auto unspec = aarch64_get_low_unspec (builtin_data.unspec);
+       icode = code_for_aarch64_high (unspec, ops[0].mode);
+       break;
+      }
+
+    case UNSPEC_FCVTN_FP8:
+      icode = code_for_aarch64 (builtin_data.unspec, ops[1].mode);
       break;
 
+    case UNSPEC_FCVTN2_FP8:
+      {
+       auto unspec = aarch64_get_low_unspec (builtin_data.unspec);
+       auto mode = ops.last ().mode;
+       if (BYTES_BIG_ENDIAN)
+         icode = code_for_aarch64_high_be (unspec, mode);
+       else
+         icode = code_for_aarch64_high_le (unspec, mode);
+       break;
+      }
+
     case UNSPEC_LUTI2:
     case UNSPEC_LUTI4:
       create_integer_operand (ops.safe_push ({}),
diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index dba103a7fb1..ae255889f5e 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -268,6 +268,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
   aarch64_def_or_undef (TARGET_SVE_BF16,
                        "__ARM_FEATURE_SVE_BF16", pfile);
 
+  aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
+
   aarch64_def_or_undef (TARGET_LS64,
                        "__ARM_FEATURE_LS64", pfile);
   aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index bc9a63b968a..6221652b38f 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -20,11 +20,19 @@
 
 #undef ENTRY_BINARY
 #define ENTRY_BINARY(N, T0, T1, T2, U, F)      \
-  ENTRY (N, binary, T0, T1, T2, U, F)
+  ENTRY (N, binary, T0, T1, T2, none, U, F)
 
 #undef ENTRY_BINARY_LANE
 #define ENTRY_BINARY_LANE(N, T0, T1, T2, U, F) \
-  ENTRY (N, binary_lane, T0, T1, T2, U, F)
+  ENTRY (N, binary_lane, T0, T1, T2, none, U, F)
+
+#undef ENTRY_TERNARY
+#define ENTRY_TERNARY(N, T0, T1, T2, T3, U, F) \
+  ENTRY (N, ternary, T0, T1, T2, T3, U, F)
+
+#undef ENTRY_UNARY
+#define ENTRY_UNARY(N, T0, T1, U, F)   \
+  ENTRY (N, unary, T0, T1, none, none, U, F)
 
 #undef ENTRY_BINARY_VHSDF
 #define ENTRY_BINARY_VHSDF(NAME, UNSPEC, FLAGS)                        \
@@ -34,6 +42,14 @@
   ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC, FLAGS)  \
   ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC, FLAGS)
 
+#undef ENTRY_BINARY_VHSDF_SIGNED
+#define ENTRY_BINARY_VHSDF_SIGNED(NAME, UNSPEC, FLAGS)         \
+  ENTRY_BINARY (NAME##_f16, f16, f16, s16, UNSPEC, FLAGS)      \
+  ENTRY_BINARY (NAME##q_f16, f16q, f16q, s16q, UNSPEC, FLAGS)  \
+  ENTRY_BINARY (NAME##_f32, f32, f32, s32, UNSPEC, FLAGS)      \
+  ENTRY_BINARY (NAME##q_f32, f32q, f32q, s32q, UNSPEC, FLAGS)  \
+  ENTRY_BINARY (NAME##q_f64, f64q, f64q, s64q, UNSPEC, FLAGS)
+
 #undef ENTRY_TERNARY_VLUT8
 #define ENTRY_TERNARY_VLUT8(T)                                 \
   ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8,      \
@@ -64,6 +80,11 @@
   ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q,  \
                     UNSPEC_LUTI4, QUIET)
 
+#undef ENTRY_UNARY_VQ_BHF
+#define ENTRY_UNARY_VQ_BHF(N, T1, UNSPEC, FLAGS)               \
+  ENTRY_UNARY (N##_bf16_mf8_fpm, bf16q, T1, UNSPEC, FLAGS)     \
+  ENTRY_UNARY (N##_f16_mf8_fpm, f16q, T1, UNSPEC, FLAGS)
+
 // faminmax
 #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
 ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX, FP)
@@ -82,3 +103,25 @@ ENTRY_TERNARY_VLUT16 (p)
 ENTRY_TERNARY_VLUT16 (s)
 ENTRY_TERNARY_VLUT16 (u)
 #undef REQUIRED_EXTENSIONS
+
+// fpm conversion
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1, f8, UNSPEC_F1CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1_high, f8q, UNSPEC_F1CVTL2_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1_low, f8q, UNSPEC_F1CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2, f8, UNSPEC_F2CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2_high, f8q, UNSPEC_F2CVTL2_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2_low, f8q, UNSPEC_F2CVTL_FP8, FP8)
+
+ENTRY_BINARY (vcvt_mf8_f16_fpm, f8, f16, f16, UNSPEC_FCVTN_FP8, FP8)
+ENTRY_BINARY (vcvtq_mf8_f16_fpm, f8q, f16q, f16q, UNSPEC_FCVTN_FP8, FP8)
+ENTRY_BINARY (vcvt_mf8_f32_fpm, f8, f32q, f32q, UNSPEC_FCVTN_FP8, FP8)
+
+ENTRY_TERNARY (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q,
+              UNSPEC_FCVTN2_FP8, FP8)
+#undef REQUIRED_EXTENSIONS
+
+// fpm scaling
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
+ENTRY_BINARY_VHSDF_SIGNED (vscale, UNSPEC_FSCALE, FP)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 05cbd38372d..f38bad72781 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -10024,3 +10024,76 @@ (define_insn "@aarch64_lut<VLUTx2:mode><VB:mode>"
   "TARGET_LUT && INTVAL (operands[4]) == 4"
   "luti%4\t%0.8h, {%S1.8h, %T1.8h}, %2[%3]"
 )
+
+;; fpm unary instructions (low part).
+(define_insn "@aarch64_<insn><mode>"
+  [(set (match_operand:VQ_BHF 0 "register_operand" "=w")
+       (unspec:VQ_BHF
+        [(match_operand:V8QI 1 "register_operand" "w")
+         (reg:DI FPM_REGNUM)]
+       FPM_UNARY_UNS))]
+  "TARGET_FP8"
+  "<b><insn>\t%0.<Vtype>, %1.8b"
+)
+
+;; fpm unary instructions (high part).
+(define_insn "@aarch64_<insn><mode>_high"
+  [(set (match_operand:VQ_BHF 0 "register_operand" "=w")
+       (unspec:VQ_BHF
+        [(vec_select:V8QI
+           (match_operand:V16QI 1 "register_operand" "w")
+           (match_operand:V16QI 2 "vect_par_cnst_hi_half"))
+         (reg:DI FPM_REGNUM)]
+       FPM_UNARY_UNS))]
+  "TARGET_FP8"
+  "<b><insn>2\t%0.<Vtype>, %1.16b"
+)
+
+;; fpm binary instructions.
+(define_insn "@aarch64_<insn><mode>"
+  [(set (match_operand:<VPACKB> 0 "register_operand" "=w")
+       (unspec:<VPACKB>
+        [(match_operand:VCVTFPM 1 "register_operand" "w")
+         (match_operand:VCVTFPM 2 "register_operand" "w")
+         (reg:DI FPM_REGNUM)]
+        FPM_BINARY_UNS))]
+  "TARGET_FP8"
+  "<insn>\t%0.<VPACKBtype>, %1.<Vtype>, %2.<Vtype>"
+)
+
+;; fpm binary instructions & merge with low.
+(define_insn "@aarch64_<insn><mode>_high_le"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (vec_concat:V16QI
+         (match_operand:V8QI 1 "register_operand" "0")
+         (unspec:V8QI
+           [(match_operand:V4SF_ONLY 2 "register_operand" "w")
+            (match_operand:V4SF_ONLY 3 "register_operand" "w")
+            (reg:DI FPM_REGNUM)]
+           FPM_BINARY_UNS)))]
+  "TARGET_FP8 && !BYTES_BIG_ENDIAN"
+  "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
+)
+
+(define_insn "@aarch64_<insn><mode>_high_be"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (vec_concat:V16QI
+         (unspec:V8QI
+           [(match_operand:V4SF_ONLY 2 "register_operand" "w")
+            (match_operand:V4SF_ONLY 3 "register_operand" "w")
+            (reg:DI FPM_REGNUM)]
+           FPM_BINARY_UNS)
+         (match_operand:V8QI 1 "register_operand" "0")))]
+  "TARGET_FP8 && BYTES_BIG_ENDIAN"
+  "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
+)
+
+;; fscale instructions
+(define_insn "@aarch64_<insn><mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
+                      (match_operand:<FCVT_TARGET> 2 "register_operand" "w")]
+                     FSCALE_UNS))]
+  "TARGET_FP8"
+  "<insn>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 90725c7faeb..7b426aae7a8 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -41,6 +41,7 @@ (define_mode_iterator SHORT [QI HI])
 ;; Iterators for single modes, for "@" patterns.
 (define_mode_iterator SI_ONLY [SI])
 (define_mode_iterator DI_ONLY [DI])
+(define_mode_iterator V4SF_ONLY [V4SF])
 
 ;; Iterator for all integer modes (up to 64-bit)
 (define_mode_iterator ALLI [QI HI SI DI])
@@ -181,6 +182,9 @@ (define_mode_iterator VSFDF [V2SF V4SF V2DF DF SF])
 ;; Advanced SIMD single Float modes.
 (define_mode_iterator VDQSF [V2SF V4SF])
 
+;; Quad vector float modes with half/bfloat elements.
+(define_mode_iterator VQ_BHF [V8HF V8BF])
+
 ;; Quad vector Float modes with half/single elements.
 (define_mode_iterator VQ_HSF [V8HF V4SF])
 
@@ -430,6 +434,9 @@ (define_mode_iterator VMULD [V4HI V8HI V2SI V4SI
 (define_mode_iterator VLUT [V8QI V16QI V4HI V4HF V4BF])
 (define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF])
 
+;; Modes available for Advanced SIMD FP8 conversion operations.
+(define_mode_iterator VCVTFPM [V4HF V8HF V4SF])
+
 ;; Iterators for single modes, for "@" patterns.
 (define_mode_iterator VNx16QI_ONLY [VNx16QI])
 (define_mode_iterator VNx16SI_ONLY [VNx16SI])
@@ -715,6 +722,12 @@ (define_c_enum "unspec"
     UNSPEC_ASHIFT_SIGNED       ; Used in aarch-simd.md.
     UNSPEC_ASHIFT_UNSIGNED     ; Used in aarch64-simd.md.
     UNSPEC_ABS         ; Used in aarch64-simd.md.
+    UNSPEC_FCVTN_FP8   ; Used in aarch64-simd.md.
+    UNSPEC_FCVTN2_FP8  ; Used in aarch64-builtins.cc.
+    UNSPEC_F1CVTL_FP8  ; Used in aarch64-simd.md.
+    UNSPEC_F1CVTL2_FP8 ; Used in aarch64-builtins.cc.
+    UNSPEC_F2CVTL_FP8  ; Used in aarch64-simd.md.
+    UNSPEC_F2CVTL2_FP8 ; Used in aarch64-builtins.cc.
     UNSPEC_FMAX                ; Used in aarch64-simd.md.
     UNSPEC_FMAXNMV     ; Used in aarch64-simd.md.
     UNSPEC_FMAXV       ; Used in aarch64-simd.md.
@@ -723,6 +736,7 @@ (define_c_enum "unspec"
     UNSPEC_FMINV       ; Used in aarch64-simd.md.
     UNSPEC_FADDV       ; Used in aarch64-simd.md.
     UNSPEC_FNEG                ; Used in aarch64-simd.md.
+    UNSPEC_FSCALE      ; Used in aarch64-simd.md.
     UNSPEC_ADDV                ; Used in aarch64-simd.md.
     UNSPEC_SMAXV       ; Used in aarch64-simd.md.
     UNSPEC_SMINV       ; Used in aarch64-simd.md.
@@ -1790,6 +1804,11 @@ (define_mode_attr Vntype [(V8HI "8b") (V4SI "4h")
 (define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
                           (V2DI "4s")])
 
+;; The result of FCVTN on two vectors of the given mode.  The result has
+;; twice as many QI elements as the input.
+(define_mode_attr VPACKB [(V4HF "V8QI") (V8HF "V16QI") (V4SF "V8QI")])
+(define_mode_attr VPACKBtype [(V4HF "8b") (V8HF "16b") (V4SF "8b")])
+
 ;; Widened modes of vector modes.
 (define_mode_attr VWIDE [(V8QI  "V8HI")  (V4HI  "V4SI")
                         (V2SI  "V2DI")  (V16QI "V8HI")
@@ -2547,7 +2566,8 @@ (define_mode_attr vec_or_offset [(V8QI "vec") (V16QI "vec") (V4HI "vec")
                                 (V8HI "vec") (V2SI "vec") (V4SI "vec")
                                 (V2DI "vec") (DI "offset")])
 
-(define_mode_attr b [(VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
+(define_mode_attr b [(V4BF "b") (V4HF "") (V8BF "b") (V8HF "")
+                    (VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
                     (VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "")
                     (VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")])
 
@@ -3794,10 +3814,25 @@ (define_int_iterator SVE2_FP8_TERNARY_LANE_VNX4SF
    UNSPEC_FMLALLTB_FP8
    UNSPEC_FMLALLTT_FP8])
 
+;; Iterators for fpm instructions
+
+(define_int_iterator FPM_UNARY_UNS [UNSPEC_F1CVTL_FP8 UNSPEC_F2CVTL_FP8])
+
+(define_int_iterator FPM_BINARY_UNS [UNSPEC_FCVTN_FP8])
+
+(define_int_iterator FSCALE_UNS [UNSPEC_FSCALE])
+
 ;; -------------------------------------------------------------------
 ;; Int Iterators Attributes.
 ;; -------------------------------------------------------------------
 
+;; The AArch64 insn mnemonic associated with an unspec.
+(define_int_attr insn
+  [(UNSPEC_F1CVTL_FP8 "f1cvtl")
+   (UNSPEC_F2CVTL_FP8 "f2cvtl")
+   (UNSPEC_FCVTN_FP8 "fcvtn")
+   (UNSPEC_FSCALE "fscale")])
+
 ;; The optab associated with an operation.  Note that for ANDF, IORF
 ;; and XORF, the optab pattern is not actually defined; we just use this
 ;; name for consistency with the integer patterns.
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
index afb44f83f60..635a7eaf4a2 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
@@ -5,19 +5,9 @@
 
 #include <arm_acle.h>
 
-#ifdef __ARM_FEATURE_FP8
-#error "__ARM_FEATURE_FP8 feature macro defined."
-#endif
-
 #pragma GCC push_options
 #pragma GCC target("arch=armv9.4-a+fp8")
 
-/* We do not define __ARM_FEATURE_FP8 until all
-   relevant features have been added. */
-#ifdef __ARM_FEATURE_FP8
-#error "__ARM_FEATURE_FP8 feature macro defined."
-#endif
-
 /*
 **test_write_fpmr_sysreg_asm_64:
 **     msr     fpmr, x0
diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
index 37bd844f581..e5a19aaefb6 100644
--- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
@@ -263,3 +263,13 @@
 #ifdef __ARM_FEATURE_GCS
 #error Foo
 #endif
+
+#pragma GCC target "arch=armv9-a"
+#ifdef __ARM_FEATURE_FP8
+#error Foo
+#endif
+
+#pragma GCC target "arch=armv9-a+fp8"
+#ifndef __ARM_FEATURE_FP8
+#error Foo
+#endif
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
new file mode 100644
index 00000000000..d95a861fcfd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vscale_f16:
+**     fscale  v0.4h, v0.4h, v1.4h
+**     ret
+*/
+float16x4_t
+test_vscale_f16 (float16x4_t a, int16x4_t b)
+{
+  return vscale_f16 (a, b);
+}
+
+/*
+** test_vscaleq_f16:
+**     fscale  v0.8h, v0.8h, v1.8h
+**     ret
+*/
+float16x8_t
+test_vscaleq_f16 (float16x8_t a, int16x8_t b)
+{
+  return vscaleq_f16 (a, b);
+}
+
+/*
+** test_vscale_f32:
+**     fscale  v0.2s, v0.2s, v1.2s
+**     ret
+*/
+float32x2_t
+test_vscale_f32 (float32x2_t a, int32x2_t b)
+{
+  return vscale_f32 (a, b);
+}
+
+/*
+** test_vscaleq_f32:
+**     fscale  v0.4s, v0.4s, v1.4s
+**     ret
+*/
+float32x4_t
+test_vscaleq_f32 (float32x4_t a, int32x4_t b)
+{
+  return vscaleq_f32 (a, b);
+}
+
+/*
+** test_vscaleq_f64:
+**     fscale  v0.2d, v0.2d, v1.2d
+**     ret
+*/
+float64x2_t
+test_vscaleq_f64 (float64x2_t a, int64x2_t b)
+{
+  return vscaleq_f64 (a, b);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
new file mode 100644
index 00000000000..39076684345
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
@@ -0,0 +1,197 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vcvt1_bf16:
+**     msr     fpmr, x0
+**     bf1cvtl v0.8h, v0.8b
+**     ret
+*/
+bfloat16x8_t
+test_vcvt1_bf16 (mfloat8x8_t a, fpm_t b)
+{
+  return vcvt1_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt1_bf16:
+**     msr     fpmr, x0
+**     bf1cvtl2        v0.8h, v0.16b
+**     ret
+*/
+bfloat16x8_t
+test_high_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt1_high_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt1_bf16:
+**     msr     fpmr, x0
+**     bf1cvtl v0.8h, v0.8b
+**     ret
+*/
+bfloat16x8_t
+test_low_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt1_low_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt1_f16:
+**     msr     fpmr, x0
+**     f1cvtl  v0.8h, v0.8b
+**     ret
+*/
+float16x8_t
+test_vcvt1_f16 (mfloat8x8_t a, fpm_t b)
+{
+  return vcvt1_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt1_f16:
+**     msr     fpmr, x0
+**     f1cvtl2 v0.8h, v0.16b
+**     ret
+*/
+float16x8_t
+test_high_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt1_high_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt1_f16:
+**     msr     fpmr, x0
+**     f1cvtl  v0.8h, v0.8b
+**     ret
+*/
+float16x8_t
+test_low_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt1_low_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt2_bf16:
+**     msr     fpmr, x0
+**     bf2cvtl v0.8h, v0.8b
+**     ret
+*/
+bfloat16x8_t
+test_vcvt2_bf16 (mfloat8x8_t a, fpm_t b)
+{
+  return vcvt2_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt2_bf16:
+**     msr     fpmr, x0
+**     bf2cvtl2        v0.8h, v0.16b
+**     ret
+*/
+bfloat16x8_t
+test_high_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt2_high_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt2_bf16:
+**     msr     fpmr, x0
+**     bf2cvtl v0.8h, v0.8b
+**     ret
+*/
+bfloat16x8_t
+test_low_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt2_low_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt2_f16:
+**     msr     fpmr, x0
+**     f2cvtl  v0.8h, v0.8b
+**     ret
+*/
+float16x8_t
+test_vcvt2_f16 (mfloat8x8_t a, fpm_t b)
+{
+  return vcvt2_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt2_f16:
+**     msr     fpmr, x0
+**     f2cvtl2 v0.8h, v0.16b
+**     ret
+*/
+float16x8_t
+test_high_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt2_high_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt2_f16:
+**     msr     fpmr, x0
+**     f2cvtl  v0.8h, v0.8b
+**     ret
+*/
+float16x8_t
+test_low_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt2_low_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt_f16:
+**     msr     fpmr, x0
+**     fcvtn   v0.8b, v0.4h, v1.4h
+**     ret
+*/
+mfloat8x8_t
+test_vcvt_f16 (float16x4_t a, float16x4_t b, fpm_t c)
+{
+  return vcvt_mf8_f16_fpm(a, b, c);
+}
+
+/*
+** test_vcvtq_f16:
+**     msr     fpmr, x0
+**     fcvtn   v0.16b, v0.8h, v1.8h
+**     ret
+*/
+mfloat8x16_t
+test_vcvtq_f16 (float16x8_t a, float16x8_t b, fpm_t c)
+{
+  return vcvtq_mf8_f16_fpm(a, b, c);
+}
+
+/*
+** test_vcvt_f32:
+**     msr     fpmr, x0
+**     fcvtn   v0.8b, v0.4s, v1.4s
+**     ret
+*/
+mfloat8x8_t
+test_vcvt_f32 (float32x4_t a, float32x4_t b, fpm_t c)
+{
+  return vcvt_mf8_f32_fpm(a, b, c);
+}
+
+/*
+** test_vcvt_high_f32:
+**     msr     fpmr, x0
+**     fcvtn2  v0.16b, v1.4s, v2.4s
+**     ret
+*/
+mfloat8x16_t
+test_vcvt_high_f32 (mfloat8x8_t a, float32x4_t b, float32x4_t c, fpm_t d)
+{
+  return vcvt_high_mf8_f32_fpm(a, b, c, d);
+}
-- 
2.25.1
