Re: [PATCH v3 3/9] aarch64: add widening sme2 fp8 conversions

Richard Ball Tue, 25 Nov 2025 01:57:08 -0800

On Mon, Nov 24, 2025 at 06:01:13PM +0000, Claudio Bantaloukas wrote:
> 
> This patch adds the following intrinsics (all __arm_streaming only) along with
> asm tests for them under the +sme2+fp8 flags:
> - svfloat16x2_t svcvt1_f16[_mf8]_x2_fpm(svmfloat8_t zn, fpm_t fpm)
> - svfloat16x2_t svcvt2_f16[_mf8]_x2_fpm(svmfloat8_t zn, fpm_t fpm)
> - svbfloat16x2_t svcvt1_bf16[_mf8]_x2_fpm(svmfloat8_t zn, fpm_t fpm)
> - svbfloat16x2_t svcvt2_bf16[_mf8]_x2_fpm(svmfloat8_t zn, fpm_t fpm)
> - svfloat16x2_t svcvtl1_f16[_mf8]_x2_fpm(svmfloat8_t zn, fpm_t fpm)
> - svfloat16x2_t svcvtl2_f16[_mf8]_x2_fpm(svmfloat8_t zn, fpm_t fpm)
> - svbfloat16x2_t svcvtl1_bf16[_mf8]_x2_fpm(svmfloat8_t zn, fpm_t fpm)
> - svbfloat16x2_t svcvtl2_bf16[_mf8]_x2_fpm(svmfloat8_t zn, fpm_t fpm)
> 
> gcc/
>       * config/aarch64/aarch64-sve-builtins-sve2.cc (svcvtl1, svcvtl2): Added
>       new FUNCTIONs.
>       * config/aarch64/aarch64-sve-builtins-sve2.def
>       (svcvt1, svcvt2, svcvtl1, svcvtl2): Added new DEF_SVE_FUNCTION_GS_FPM.
>       * config/aarch64/aarch64-sve-builtins-sve2.h (svcvtl1, svcvtl2): Added
>       new function_base.
>       * config/aarch64/aarch64-sve-builtins.cc
>       (function_resolver::resolve_unary): use group_suffix_id when resolving
>       C overloads.
>       * config/aarch64/aarch64-sve2.md
>       (@aarch64_sve2_fp8_cvt_<fp8_cvt_uns_op><mode>): Added new define_insn.
>       * config/aarch64/aarch64.h (TARGET_SSME2_FP8): Added new define.
>       * config/aarch64/iterators.md
>       (UNSPEC_F1CVTL, UNSPEC_F2CVTL): Added new unspecs.
>       (FP8CVT_UNS): Extended int_iterator.
>       (fp8_cvt_uns_op): Likewise.
> 
> gcc/testsuite/
>       * g++.target/aarch64/sme2/aarch64-sme2-acle-asm.exp: Use tuning flag
>       to reduce churn in testsuites.
>       * gcc.target/aarch64/sme2/aarch64-sme2-acle-asm.exp: Likewise.
>       * gcc.target/aarch64/sme2/acle-asm/cvt_mf8.c: Added test file.
>       * gcc.target/aarch64/sme2/acle-asm/cvtl_mf8.c: Likewise.
>       * gcc.target/aarch64/sve/acle/asm/test_sve_acle.h (TEST_X2_WIDE): Added
>       fpm0 argument for intrinsics.
> ---
>  .../aarch64/aarch64-sve-builtins-sve2.cc      |  2 +
>  .../aarch64/aarch64-sve-builtins-sve2.def     |  8 ++++
>  .../aarch64/aarch64-sve-builtins-sve2.h       |  2 +
>  gcc/config/aarch64/aarch64-sve-builtins.cc    |  3 +-
>  gcc/config/aarch64/aarch64-sve2.md            | 10 ++++
>  gcc/config/aarch64/aarch64.h                  |  2 +
>  gcc/config/aarch64/iterators.md               |  6 +++
>  .../aarch64/sme2/aarch64-sme2-acle-asm.exp    |  5 +-
>  .../aarch64/sme2/aarch64-sme2-acle-asm.exp    |  5 +-
>  .../aarch64/sme2/acle-asm/cvt_mf8.c           | 47 +++++++++++++++++++
>  .../aarch64/sme2/acle-asm/cvtl_mf8.c          | 47 +++++++++++++++++++
>  .../aarch64/sve/acle/asm/test_sve_acle.h      |  1 +
>  12 files changed, 135 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/cvt_mf8.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/cvtl_mf8.c
>


> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> index 95c5ed81d61..ee392c3a745 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
> @@ -1020,6 +1020,8 @@ FUNCTION (svclamp, svclamp_impl,)
>  FUNCTION (svcvt1, svcvt_fp8_impl, (UNSPEC_F1CVT))
>  FUNCTION (svcvt2, svcvt_fp8_impl, (UNSPEC_F2CVT))
>  FUNCTION (svcvtl, svcvtl_impl,)
> +FUNCTION (svcvtl1, svcvt_fp8_impl, (UNSPEC_F1CVTL))
> +FUNCTION (svcvtl2, svcvt_fp8_impl, (UNSPEC_F2CVTL))
>  FUNCTION (svcvtlt1, svcvt_fp8_impl, (UNSPEC_F1CVTLT))
>  FUNCTION (svcvtlt2, svcvt_fp8_impl, (UNSPEC_F2CVTLT))
>  FUNCTION (svcvtlt, unspec_based_function, (-1, -1, UNSPEC_COND_FCVTLT))
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def 
> b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> index b622fe33458..8029afd4007 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> @@ -418,3 +418,11 @@ DEF_SVE_FUNCTION_GS_FPM (svdot_lane, 
> ternary_mfloat8_lane_group_selection, s_flo
>  DEF_SVE_FUNCTION_GS_FPM (svdot, ternary_mfloat8, h_float_mf8, none, none, 
> set)
>  DEF_SVE_FUNCTION_GS_FPM (svdot_lane, ternary_mfloat8_lane_group_selection, 
> h_float_mf8, none, none, set)
>  #undef REQUIRED_EXTENSIONS
> +
> +#define REQUIRED_EXTENSIONS \
> +  streaming_only (AARCH64_FL_SME2 | AARCH64_FL_FP8)
> +DEF_SVE_FUNCTION_GS_FPM (svcvt1, unary_convert, cvt_mf8, x2, none, set)
> +DEF_SVE_FUNCTION_GS_FPM (svcvt2, unary_convert, cvt_mf8, x2, none, set)
> +DEF_SVE_FUNCTION_GS_FPM (svcvtl1, unary_convert, cvt_mf8, x2, none, set)
> +DEF_SVE_FUNCTION_GS_FPM (svcvtl2, unary_convert, cvt_mf8, x2, none, set)
> +#undef REQUIRED_EXTENSIONS
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.h 
> b/gcc/config/aarch64/aarch64-sve-builtins-sve2.h
> index 6d7d0af2641..7c1745f5c9e 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.h
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.h
> @@ -64,6 +64,8 @@ namespace aarch64_sve
>      extern const function_base *const svcvt1;
>      extern const function_base *const svcvt2;
>      extern const function_base *const svcvtl;
> +    extern const function_base *const svcvtl1;
> +    extern const function_base *const svcvtl2;
>      extern const function_base *const svcvtlt;
>      extern const function_base *const svcvtlt1;
>      extern const function_base *const svcvtlt2;
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins.cc
> index e8eeedb4d36..03481ee4a77 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
> @@ -3207,7 +3207,8 @@ function_resolver::resolve_unary (type_class_index 
> merge_tclass,
>    /* Handle convert-like functions in which the first type suffix is
>       explicit.  */
>    if (type_suffix_ids[0] != NUM_TYPE_SUFFIXES)
> -    return resolve_to (mode_suffix_id, type_suffix_ids[0], type);
> +    return resolve_to (mode_suffix_id, type_suffix_ids[0], type,
> +                    group_suffix_id);
>  
>    return resolve_to (mode_suffix_id, type);
>  }
> diff --git a/gcc/config/aarch64/aarch64-sve2.md 
> b/gcc/config/aarch64/aarch64-sve2.md
> index 91091835182..ab8098d3327 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -3591,6 +3591,16 @@ (define_insn "@aarch64_sve_cvtl<mode>"
>    [(set_attr "sve_type" "sve_fp_cvt")]
>  )
>  
> +(define_insn "@aarch64_sve2_fp8_cvt_<fp8_cvt_uns_op><mode>"
> +  [(set (match_operand:SVE_FULL_HFx2 0 "aligned_register_operand" "=Uw2")
> +     (unspec:SVE_FULL_HFx2
> +       [(match_operand:VNx16QI 1 "register_operand" "w")
> +       (reg:DI FPM_REGNUM)]
> +       FP8CVT_UNS))]
> +  "TARGET_SSME2_FP8"
> +  "<b><fp8_cvt_uns_op>\t%0, %1.b"
> +)
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [FP<-FP] Multi-vector narrowing conversions
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 0e596b59744..58a17d1f5af 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -531,6 +531,8 @@ through +ssve-fp8dot2.  */
>               (TARGET_SVE2 && TARGET_FP8DOT2) || TARGET_STREAMING) \
>               && (AARCH64_HAVE_ISA(SSVE_FP8DOT2) || TARGET_NON_STREAMING))
>  
> +#define TARGET_SSME2_FP8 (TARGET_FP8 && TARGET_STREAMING_SME2)
> +
>  /* Standard register usage.  */
>  
>  /* 31 64-bit general purpose registers R0-R30:
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 0c80b7adeae..a3416dbcbb3 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1051,8 +1051,10 @@ (define_c_enum "unspec"
>      UNSPEC_EORBT     ; Used in aarch64-sve2.md.
>      UNSPEC_EORTB     ; Used in aarch64-sve2.md.
>      UNSPEC_F1CVT     ; Used in aarch64-sve2.md.
> +    UNSPEC_F1CVTL    ; Used in aarch64-sve2.md.
>      UNSPEC_F1CVTLT   ; Used in aarch64-sve2.md.
>      UNSPEC_F2CVT     ; Used in aarch64-sve2.md.
> +    UNSPEC_F2CVTL    ; Used in aarch64-sve2.md.
>      UNSPEC_F2CVTLT   ; Used in aarch64-sve2.md.
>      UNSPEC_FADDP     ; Used in aarch64-sve2.md.
>      UNSPEC_FCVTNB    ; Used in aarch64-sve2.md.
> @@ -4006,6 +4008,8 @@ (define_int_iterator SET_FPSCR
>  (define_int_iterator FP8CVT_UNS
>    [UNSPEC_F1CVT
>     UNSPEC_F2CVT
> +   UNSPEC_F1CVTL
> +   UNSPEC_F2CVTL
>     UNSPEC_F1CVTLT
>     UNSPEC_F2CVTLT])
>  
> @@ -5140,5 +5144,7 @@ (define_int_attr atomic_ldoptab
>  (define_int_attr fp8_cvt_uns_op
>    [(UNSPEC_F1CVT "f1cvt")
>     (UNSPEC_F2CVT "f2cvt")
> +   (UNSPEC_F1CVTL "f1cvtl")
> +   (UNSPEC_F2CVTL "f2cvtl")
>     (UNSPEC_F1CVTLT "f1cvtlt")
>     (UNSPEC_F2CVTLT "f2cvtlt")])
> diff --git a/gcc/testsuite/g++.target/aarch64/sme2/aarch64-sme2-acle-asm.exp 
> b/gcc/testsuite/g++.target/aarch64/sme2/aarch64-sme2-acle-asm.exp
> index 334b1108ddc..74e46933ca3 100644
> --- a/gcc/testsuite/g++.target/aarch64/sme2/aarch64-sme2-acle-asm.exp
> +++ b/gcc/testsuite/g++.target/aarch64/sme2/aarch64-sme2-acle-asm.exp
> @@ -39,7 +39,10 @@ if { [check_effective_target_aarch64_sme2] } {
>  
>  # Turn off any codegen tweaks by default that may affect expected assembly.
>  # Tests relying on those should turn them on explicitly.
> -set sme2_flags "$sme2_flags -mtune=generic -moverride=tune=none"
> +set sme2_flags "$sme2_flags -mtune=generic "
> +set sme2_flags "$sme2_flags -moverride=tune=none"
> +# Reduce testsuite churn when writing to fmpr
> +set sme2_flags "$sme2_flags -moverride=tune=cheap_fpmr_write"

I am inclined to feel that this is more readable like this:

set sme2_flags "$sme2_flags -mtune=generic
                            -moverride=tune=none
                            -moverride=tune=cheap_fpmr_write"

  
>  global gcc_runtest_parallelize_limit_minor
>  if { [info exists gcc_runtest_parallelize_limit_minor] } {
> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/aarch64-sme2-acle-asm.exp 
> b/gcc/testsuite/gcc.target/aarch64/sme2/aarch64-sme2-acle-asm.exp
> index 6bd8784779c..42b6fccf865 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sme2/aarch64-sme2-acle-asm.exp
> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/aarch64-sme2-acle-asm.exp
> @@ -39,7 +39,10 @@ if { [check_effective_target_aarch64_sme2] } {
>  
>  # Turn off any codegen tweaks by default that may affect expected assembly.
>  # Tests relying on those should turn them on explicitly.
> -set sme2_flags "$sme2_flags -mtune=generic -moverride=tune=none"
> +set sme2_flags "$sme2_flags -mtune=generic "
> +set sme2_flags "$sme2_flags -moverride=tune=none"
> +# Reduce testsuite churn when writing to fmpr
> +set sme2_flags "$sme2_flags -moverride=tune=cheap_fpmr_write"
> 

Same as above

>  global gcc_runtest_parallelize_limit_minor
>  if { [info exists gcc_runtest_parallelize_limit_minor] } {
> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/cvt_mf8.c 
> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/cvt_mf8.c
> new file mode 100644
> index 00000000000..0fb20c8edf8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/cvt_mf8.c
> @@ -0,0 +1,47 @@
> +/* { dg-do assemble { target { aarch64_asm_fp8_ok && aarch64_asm_sme2_ok } } 
> } */
> +/* { dg-do compile { target { ! { aarch64_asm_fp8_ok && aarch64_asm_sme2_ok 
> } } } } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "test_sme2_acle.h"
> +
> +#pragma GCC target "+fp8"
> +
> +/*
> +** cvt1_f16_mf8_x2_fpm:
> +**   msr     fpmr, x0
> +**   f1cvt   {z0\.h - z1\.h}, z0\.b
> +**   ret
> +*/
> +TEST_X2_WIDE (cvt1_f16_mf8_x2_fpm, svfloat16x2_t, svmfloat8_t,
> +           z0_res = svcvt1_f16_mf8_x2_fpm (z0, fpm0),
> +           z0_res = svcvt1_f16_x2_fpm (z0, fpm0))
> +
> +/*
> +** cvt1_bf16_mf8_x2_fpm:
> +**   msr     fpmr, x0
> +**   bf1cvt  {z0\.h - z1\.h}, z0\.b
> +**   ret
> +*/
> +TEST_X2_WIDE (cvt1_bf16_mf8_x2_fpm, svbfloat16x2_t, svmfloat8_t,
> +           z0_res = svcvt1_bf16_mf8_x2_fpm (z0, fpm0),
> +           z0_res = svcvt1_bf16_x2_fpm (z0, fpm0))
> +
> +/*
> +** cvt2_f16_mf8_x2_fpm:
> +**   msr     fpmr, x0
> +**   f2cvt   {z0\.h - z1\.h}, z0\.b
> +**   ret
> +*/
> +TEST_X2_WIDE (cvt2_f16_mf8_x2_fpm, svfloat16x2_t, svmfloat8_t,
> +           z0_res = svcvt2_f16_mf8_x2_fpm (z0, fpm0),
> +           z0_res = svcvt2_f16_x2_fpm (z0, fpm0))
> +
> +/*
> +** cvt2_bf16_mf8_x2_fpm:
> +**   msr     fpmr, x0
> +**   bf2cvt  {z0\.h - z1\.h}, z0\.b
> +**   ret
> +*/
> +TEST_X2_WIDE (cvt2_bf16_mf8_x2_fpm, svbfloat16x2_t, svmfloat8_t,
> +           z0_res = svcvt2_bf16_mf8_x2_fpm (z0, fpm0),
> +           z0_res = svcvt2_bf16_x2_fpm (z0, fpm0))

Should this file have an _x2 in the name for clarity?


> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/cvtl_mf8.c 
> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/cvtl_mf8.c
> new file mode 100644
> index 00000000000..8a8326bd068
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/cvtl_mf8.c
> @@ -0,0 +1,47 @@
> +/* { dg-do assemble { target { aarch64_asm_fp8_ok && aarch64_asm_sme2_ok } } 
> } */
> +/* { dg-do compile { target { ! { aarch64_asm_fp8_ok && aarch64_asm_sme2_ok 
> } } } } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "test_sme2_acle.h"
> +
> +#pragma GCC target "+fp8"
> +
> +/*
> +** cvtl1_f16_mf8_x2_fpm:
> +**   msr     fpmr, x0
> +**   f1cvtl  {z0\.h - z1\.h}, z0\.b
> +**   ret
> +*/
> +TEST_X2_WIDE (cvtl1_f16_mf8_x2_fpm, svfloat16x2_t, svmfloat8_t,
> +           z0_res = svcvtl1_f16_mf8_x2_fpm (z0, fpm0),
> +           z0_res = svcvtl1_f16_x2_fpm (z0, fpm0))
> +
> +/*
> +** cvtl1_bf16_mf8_x2_fpm:
> +**   msr     fpmr, x0
> +**   bf1cvtl {z0\.h - z1\.h}, z0\.b
> +**   ret
> +*/
> +TEST_X2_WIDE (cvtl1_bf16_mf8_x2_fpm, svbfloat16x2_t, svmfloat8_t,
> +           z0_res = svcvtl1_bf16_mf8_x2_fpm (z0, fpm0),
> +           z0_res = svcvtl1_bf16_x2_fpm (z0, fpm0))
> +
> +/*
> +** cvtl2_f16_mf8_x2_fpm:
> +**   msr     fpmr, x0
> +**   f2cvtl  {z0\.h - z1\.h}, z0\.b
> +**   ret
> +*/
> +TEST_X2_WIDE (cvtl2_f16_mf8_x2_fpm, svfloat16x2_t, svmfloat8_t,
> +           z0_res = svcvtl2_f16_mf8_x2_fpm (z0, fpm0),
> +           z0_res = svcvtl2_f16_x2_fpm (z0, fpm0))
> +
> +/*
> +** cvtl2_bf16_mf8_x2_fpm:
> +**   msr     fpmr, x0
> +**   bf2cvtl {z0\.h - z1\.h}, z0\.b
> +**   ret
> +*/
> +TEST_X2_WIDE (cvtl2_bf16_mf8_x2_fpm, svbfloat16x2_t, svmfloat8_t,
> +           z0_res = svcvtl2_bf16_mf8_x2_fpm (z0, fpm0),
> +           z0_res = svcvtl2_bf16_x2_fpm (z0, fpm0))

Same as above, an _x2 in the name?

> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h 
> b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
> index 0adb39ad8b2..7c156c4cf2a 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h
> @@ -767,6 +767,7 @@
>  #define TEST_X2_WIDE(NAME, TTYPE, ZTYPE, CODE1, CODE2)               \
>    PROTO (NAME, void, ())                                     \
>    {                                                          \
> +    register fpm_t fpm0 __asm ("x0");                                \
>      register ZTYPE z0 __asm ("z0");                          \
>      register ZTYPE z5 __asm ("z5");                          \
>      register TTYPE z6 __asm ("z6");                          \

Re: [PATCH v3 3/9] aarch64: add widening sme2 fp8 conversions

Reply via email to