On Wed, Mar 25, 2026 at 12:46 PM Pengxuan Zheng
<[email protected]> wrote:
>
> This enables the vectorizer to vectorize conversion from long to float for
> aarch64 target.
>
> Bootstrapped and tested on aarch64_linux_gnu.
>
>         PR target/123748
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md (vec_packs_float_v2di): New pattern.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/pr123748.c: New test.
>
> Signed-off-by: Pengxuan Zheng <[email protected]>
> ---
>  gcc/config/aarch64/aarch64-simd.md          | 27 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/pr123748.c | 13 ++++++++++
>  2 files changed, 40 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr123748.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c314e85927d..b4f98e45a8e 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3203,6 +3203,33 @@ (define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><VDQ_HSDI:mode>3"
>    [(set_attr "type" "neon_int_to_fp_<VDQ_HSDI:stype><q>")]
>  )
>
> +(define_expand "vec_packs_float_v2di"
> +  [(set (match_operand:V4SF 0 "register_operand" "=w")
> +       (vec_concat:V4SF
> +         (float:V2SF (match_operand:V2DI 1 "register_operand" "w"))
> +         (float:V2SF (match_operand:V2DI 2 "register_operand" "w"))))]
> +  "TARGET_SIMD && flag_unsafe_math_optimizations"
> +  {
> +    rtx tmp = gen_reg_rtx (V2DFmode);
> +    rtx tmp1 = gen_reg_rtx (V2DFmode);
> +    rtx tmp2 = gen_reg_rtx (V2SFmode);
> +    rtx tmp3 = gen_reg_rtx (V2SFmode);
> +    emit_insn (gen_floatv2div2df2 (tmp, operands[1]));
> +    emit_insn (gen_floatv2div2df2 (tmp1, operands[2]));
> +    emit_insn (gen_truncv2dfv2sf2 (tmp2, tmp));
> +    emit_insn (gen_truncv2dfv2sf2 (tmp3, tmp1));

Since this is a double rounding (DI->DF->SF),
flag_unsafe_math_optimizations is the correct check, as you already
have.

> +    rtx tmp4 = gen_reg_rtx (V2DFmode);
> +    if (BYTES_BIG_ENDIAN)
> +      emit_insn (gen_aarch64_zip1v2df_low (tmp4, gen_lowpart (DFmode, tmp3),
> +                                          gen_lowpart (DFmode, tmp2)));
> +    else
> +      emit_insn (gen_aarch64_zip1v2df_low (tmp4, gen_lowpart (DFmode, tmp2),
> +                                          gen_lowpart (DFmode, tmp3)));

This can be simplified to just:

if (BYTES_BIG_ENDIAN)
  std::swap (tmp2, tmp3);
emit_insn (gen_aarch64_zip1v2df_low (tmp4, gen_lowpart (DFmode, tmp2),
                                     gen_lowpart (DFmode, tmp3)));

Note: I think we should be using DImode here rather than DFmode.

Thanks,
Andrea


> +    emit_move_insn (operands[0], gen_lowpart (V4SFmode, tmp4));
> +    DONE;
> +  }
> +)
> +
>  ;; ??? Note that the vectorizer usage of the vec_unpacks_[lo/hi] patterns
>  ;; is inconsistent with vector ordering elsewhere in the compiler, in that
>  ;; the meaning of HI and LO changes depending on the target endianness.
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr123748.c b/gcc/testsuite/gcc.target/aarch64/pr123748.c
> new file mode 100644
> index 00000000000..8ba290cf12d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr123748.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast" } */
> +
> +void
> +f (float *__restrict f, long *__restrict l)
> +{
> +  for (int i = 0; i < 128; i++)
> +    f[i] = l[i];
> +}
> +
> +/* { dg-final { scan-assembler-times {scvtf\t} 2 } } */
> +/* { dg-final { scan-assembler-times {fcvtn\t} 2 } } */
> +/* { dg-final { scan-assembler-times {zip1\t} 1 } } */
> --
> 2.34.1
>

Reply via email to