RE: [PATCH 3/4] aarch64: implement vec_concat support for sub-64-bit types

Tamar Christina Wed, 13 May 2026 01:21:36 -0700

Hi Artemiy,

> -----Original Message-----
> From: Artemiy Volkov <[email protected]>
> Sent: 27 April 2026 09:06
> To: [email protected]
> Cc: Tamar Christina <[email protected]>; Wilco Dijkstra
> <[email protected]>; [email protected]; Richard
> Earnshaw <[email protected]>; [email protected]; Alice
> Carlotti <[email protected]>; Alex Coplan <[email protected]>;
> Artemiy Volkov <[email protected]>
> Subject: [PATCH 3/4] aarch64: implement vec_concat support for sub-64-bit
> types
> 
> This patch improves handling of 2-element vec_concats in
> aarch64_expand_vector_init_fallback (): where previously the
> aarch64_vec_concat insn was emitted only for pairs of vectors, we now
> allow scalar operands as well.  Furthermore, if the two operands are the
> same, we can now emit a vec_duplicate instead of a vec_concat, leading to
> better code generation.
> 
> This is backed by the new combine{z,_internal}{,_be} insn patterns, each
> of which is split into two variants: one for integral 16- and 32-bit modes
> (involving only GPRs and memory), and one for the remaining modes (which
> requires the "w" alternatives as well).
> 
> The effect of the changes is illustrated by the changes to vec-init-23.c,
> introduced in the previous patch (and a handful of other vector-init
> related tests).


Nice!

> 
> gcc/ChangeLog:
> 
>       * config/aarch64/aarch64-simd.md
> (*aarch64_combine_internal<mode>):
>       New insn patterns.
>       (*aarch64_combine_internal_be<mode>): Likewise.
>       (*aarch64_combinez<mode>): Likewise.
>       (*aarch64_combinez_be<mode>): Likewise.
>       (@aarch64_vec_concat<mode>): Support smaller vector and scalar
> modes.
>       * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback):
>       Handle the case of two scalar elements.
>       * config/aarch64/iterators.md (SSUB64): New mode iterator.
>       (VSSUB64): Likewise.
>       (VSSUB32_I): Likewise.
>       (VSSUB64_F): Likewise.
>       (VS32_I_SUB64_F): Likewise.
>       (bitsize): Define attribute for sub-64-bit vector and scalar modes.
>       (VDBL): Likewise.
>       (single_dwx): New mode attribute.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/aarch64/sve/gather_load_10.c: Adjust testcase.
>       * gcc.target/aarch64/sve/slp_1.c: Likewise.
>       * gcc.target/aarch64/vec-init-18.c: Likewise.
>       * gcc.target/aarch64/vec-init-23.c: Likewise.
>       * gcc.target/aarch64/vec-init-single-const.c: Likewise.
> ---
>  gcc/config/aarch64/aarch64-simd.md            | 115 +++++++++++++++++-
>  gcc/config/aarch64/aarch64.cc                 |  19 +--
>  gcc/config/aarch64/iterators.md               |  39 +++++-
>  .../gcc.target/aarch64/sve/gather_load_10.c   |   3 +-
>  gcc/testsuite/gcc.target/aarch64/sve/slp_1.c  |   4 +-
>  .../gcc.target/aarch64/vec-init-18.c          |   7 +-
>  .../gcc.target/aarch64/vec-init-23.c          |  85 ++++++-------
>  .../aarch64/vec-init-single-const.c           |   4 +-
>  8 files changed, 209 insertions(+), 67 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index 4bb26621efc..b57d4e29807 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4814,6 +4814,34 @@
>    }
>  )
> 
> +(define_insn "*aarch64_combine_internal<mode>"
> +  [(set (match_operand:<VDBL> 0 "register_operand")
> +     (vec_concat:<VDBL>
> +       (match_operand:VS32_I_SUB64_F 1 "register_operand")
> +       (match_operand:VS32_I_SUB64_F 2
> "aarch64_simd_nonimmediate_operand")))]
> +  "TARGET_FLOAT
> +   && !BYTES_BIG_ENDIAN"
> +  {@ [ cons: =0 , 1  , 2   ; attrs: type               , arch  ]
> +     [ w        , w  , w   ; neon_permute              , simd  ] 
> uzp1\t%0.<Vdduptype>,
> %1.<Vdduptype>, %2.<Vdduptype>
> +     [ w        , 0  , w   ; neon_move                 , simd  ] 
> mov\t%0.<single_type>[1],
> %2.<single_type>[0]
> +     [ w        , 0  , Utv ; neon_load1_one_lane       , simd  ]
> ld1\t{%0.<single_type>}[1], %2
> +     [ w        , 0  , r   ; neon_from_gp              , simd  ] 
> ins\t%0.<single_type>[1],
> %<single_wx>2
> +     [ ?r       , 0  , r   ; bfm                       , *     ] 
> bfi\t%<single_dwx>0,
> %<single_dwx>2, <bitsize>, <bitsize>
> +  }
> +)
> +
> +(define_insn "*aarch64_combine_internal<mode>"
> +  [(set (match_operand:<VDBL> 0 "register_operand")
> +     (vec_concat:<VDBL>
> +       (match_operand:VSSUB32_I 1 "register_operand")
> +       (match_operand:VSSUB32_I 2
> "aarch64_simd_nonimmediate_operand")))]
> +  "TARGET_FLOAT
> +   && !BYTES_BIG_ENDIAN"
> +  {@ [ cons: =0 , 1  , 2  ; attrs: type               , arch  ]
> +     [ r        , 0  , r  ; bfm                       , *     ] 
> bfi\t%<single_dwx>0,
> %<single_dwx>2, <bitsize>, <bitsize>
> +  }
> +)
> +
>  (define_insn "*aarch64_combine_internal_be<mode>"
>    [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand")
>       (vec_concat:<VDBL>
> @@ -4833,6 +4861,35 @@
>    }
>  )
> 
> +(define_insn "*aarch64_combine_internal_be<mode>"
> +  [(set (match_operand:<VDBL> 0 "register_operand")
> +     (vec_concat:<VDBL>
> +       (match_operand:VS32_I_SUB64_F 2
> "aarch64_simd_nonimmediate_operand")
> +       (match_operand:VS32_I_SUB64_F 1 "register_operand")))]
> +  "TARGET_FLOAT
> +   && BYTES_BIG_ENDIAN"
> +  {@ [ cons: =0 , 1  , 2   ; attrs: type               , arch  ]
> +     [ w        , w  , w   ; neon_permute              , simd  ] 
> uzp1\t%0.<Vdduptype>,
> %1.<Vdduptype>, %2.<Vdduptype>
> +     [ w        , 0  , w   ; neon_move                 , simd  ] 
> mov\t%0.<single_type>[1],
> %2.<single_type>[0]
> +     [ w        , 0  , Utv ; neon_load1_one_lane       , simd  ]
> ld1\t{%0.<single_type>}[1], %2
> +     [ w        , 0  , r   ; neon_from_gp              , simd  ] 
> ins\t%0.<single_type>[1],
> %<single_wx>2
> +     [ ?r       , 0  , r   ; bfm                       , *     ] 
> bfi\t%<single_dwx>0,
> %<single_dwx>2, <bitsize>, <bitsize>
> +  }
> +)
> +
> +(define_insn "*aarch64_combine_internal_be<mode>"
> +  [(set (match_operand:<VDBL> 0 "register_operand")
> +     (vec_concat:<VDBL>
> +       (match_operand:VSSUB32_I 2
> "aarch64_simd_nonimmediate_operand")
> +       (match_operand:VSSUB32_I 1 "register_operand")))]
> +  "TARGET_FLOAT
> +   && BYTES_BIG_ENDIAN"
> +  {@ [ cons: =0 , 1  , 2  ; attrs: type               , arch  ]
> +     [ r        , 0  , r  ; bfm                       , *     ] 
> bfi\t%<single_dwx>0,
> %<single_dwx>2, <bitsize>, <bitsize>
> +  }
> +)
> +
> +
>  ;; In this insn, operand 1 should be low, and operand 2 the high part of the
>  ;; dest vector.
> 
> @@ -4849,6 +4906,33 @@
>    }
>  )
> 
> +(define_insn "*aarch64_combinez<mode>"
> +  [(set (match_operand:<VDBL> 0 "register_operand")
> +     (vec_concat:<VDBL>
> +          (match_operand:VSSUB32_I 1 "nonimmediate_operand")
> +       (match_operand:VSSUB32_I 2
> "aarch64_simd_or_scalar_imm_zero")))]
> +  "TARGET_FLOAT && !BYTES_BIG_ENDIAN"
> +  {@ [ cons: =0 , 1  ; attrs: type      ]
> +     [ r        , r  ; mov_reg          ] uxt<size>\t%w0, %w1
> +     [ r        , m  ; load_4           ] ldr<size>\t%<single_wx>0, %1
> +  }
> +)
> +
> +(define_insn "*aarch64_combinez<mode>"
> +  [(set (match_operand:<VDBL> 0 "register_operand")
> +     (vec_concat:<VDBL>
> +          (match_operand:VS32_I_SUB64_F 1 "nonimmediate_operand")
> +       (match_operand:VS32_I_SUB64_F 2
> "aarch64_simd_or_scalar_imm_zero")))]
> +  "TARGET_FLOAT && !BYTES_BIG_ENDIAN"
> +  {@ [ cons: =0 , 1  ; attrs: type      ]
> +     [ w        , w  ; neon_move        ] fmov\t%<single_type>0, 
> %<single_type>1
> +     [ w        , r  ; neon_from_gp     ] fmov\t%<single_type>0, 
> %<single_wx>1
> +     [ w        , m  ; neon_load1_1reg  ] ldr\t%<single_type>0, %1
> +     [ r        , r  ; mov_reg          ] uxtw\t%x0, %w1
> +     [ r        , m  ; load_4           ] ldr<size>\t%<single_wx>0, %1
> +  }
> +)
> +
>  (define_insn "*aarch64_combinez_be<mode>"
>    [(set (match_operand:<VDBL> 0 "register_operand")
>          (vec_concat:<VDBL>
> @@ -4862,14 +4946,41 @@
>    }
>  )
> 
> +(define_insn "*aarch64_combinez_be<mode>"
> +  [(set (match_operand:<VDBL> 0 "register_operand")
> +     (vec_concat:<VDBL>
> +       (match_operand:VSSUB32_I 2 "aarch64_simd_or_scalar_imm_zero")
> +          (match_operand:VSSUB32_I 1 "nonimmediate_operand")))]
> +  "TARGET_FLOAT && BYTES_BIG_ENDIAN"
> +  {@ [ cons: =0 , 1  ; attrs: type      ]
> +     [ r        , r  ; mov_reg          ] uxt<size>\t%w0, %w1
> +     [ r        , m  ; load_4           ] ldr<size>\t%<single_wx>0, %1
> +  }
> +)
> +
> +(define_insn "*aarch64_combinez_be<mode>"
> +  [(set (match_operand:<VDBL> 0 "register_operand")
> +     (vec_concat:<VDBL>
> +       (match_operand:VS32_I_SUB64_F 2
> "aarch64_simd_or_scalar_imm_zero")
> +          (match_operand:VS32_I_SUB64_F 1 "nonimmediate_operand")))]
> +  "TARGET_FLOAT && BYTES_BIG_ENDIAN"
> +  {@ [ cons: =0 , 1  ; attrs: type      ]
> +     [ w        , w  ; neon_move        ] fmov\t%<single_type>0, 
> %<single_type>1
> +     [ w        , r  ; neon_from_gp     ] fmov\t%<single_type>0, 
> %<single_wx>1
> +     [ w        , m  ; neon_load1_1reg  ] ldr\t%<single_type>0, %1
> +     [ r        , r  ; mov_reg          ] uxtw\t%x0, %w1
> +     [ r        , m  ; load_4           ] ldr<size>\t%<single_wx>0, %1
> +  }
> +)
> +
>  ;; Form a vector whose first half (in array order) comes from operand 1
>  ;; and whose second half (in array order) comes from operand 2.
>  ;; This operand order follows the RTL vec_concat operation.
>  (define_expand "@aarch64_vec_concat<mode>"
>    [(set (match_operand:<VDBL> 0 "register_operand")
>       (vec_concat:<VDBL>
> -       (match_operand:VDCSIF 1 "general_operand")
> -       (match_operand:VDCSIF 2 "general_operand")))]
> +       (match_operand:VDUP 1 "general_operand")
> +       (match_operand:VDUP 2 "general_operand")))]
>    "TARGET_FLOAT"
>  {
>    int lo = BYTES_BIG_ENDIAN ? 2 : 1;
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5b1afa50ff8..f08cf032708 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25502,19 +25502,24 @@ aarch64_expand_vector_init_fallback (rtx
> target, rtx vals)
>    rtx v0 = XVECEXP (vals, 0, 0);
>    bool all_same = true;
> 
> -  /* This is a special vec_init<M><N> where N is not an element mode but a
> +  /* This is a special vec_init<M><N> where N is either an element mode or a
>       vector mode with half the elements of M.  We expect to find two entries
>       of mode N in VALS and we must put their concatentation into TARGET.  */
> -  if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP
> (vals, 0, 0))))
> +  if (XVECLEN (vals, 0) == 2 && GET_MODE (v0) != VOIDmode)

I don't think we want such a wide relaxation, as checking only for
!= VOIDmode would also accept things like complex modes (COMPLEX_MODE_P).

So perhaps just test GET_MODE (v0) with
VECTOR_MODE_P || SCALAR_INT_MODE_P || SCALAR_FLOAT_MODE_P?

Patch is OK with that change.

Thanks,
Tamar

>      {
> -      machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
> +      rtx v1 = XVECEXP (vals, 0, 1);
> +      machine_mode narrow_mode = GET_MODE (v0);
>        gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
>                 && known_eq (GET_MODE_SIZE (mode),
>                              2 * GET_MODE_SIZE (narrow_mode)));
> -      emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
> -                                      XVECEXP (vals, 0, 0),
> -                                      XVECEXP (vals, 0, 1)));
> -     return;
> +      if (rtx_equal_p (v0, v1))
> +       aarch64_emit_move (target,
> +                       gen_vec_duplicate (mode,
> +                                          force_reg (narrow_mode, v0)));
> +      else
> +       emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
> +                                       v0, v1));
> +      return;
>     }
> 
>    /* Count the number of variable elements to initialise.  */
> diff --git a/gcc/config/aarch64/iterators.md
> b/gcc/config/aarch64/iterators.md
> index 1fc67d95bd4..eafb8f45a1b 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -234,6 +234,21 @@
>  ;; All sub-64-bit vector modes.
>  (define_mode_iterator VSUB64 [V2QI V4QI V2HI V2HF V2BF])
> 
> +;; All sub-64-bit scalar modes.
> +(define_mode_iterator SSUB64 [QI HI HF BF SI SF])
> +
> +;; All sub-64-bit modes.
> +(define_mode_iterator VSSUB64 [VSUB64 SSUB64])
> +
> +;; All sub-32-bit integer modes.
> +(define_mode_iterator VSSUB32_I [V2QI QI HI])
> +
> +;; All sub-64-bit floating-point modes.
> +(define_mode_iterator VSSUB64_F [V2HF V2BF HF BF])
> +
> +;; All 32-bit integer and sub-64-bit floating point modes.
> +(define_mode_iterator VS32_I_SUB64_F [V4QI V2HI VSSUB64_F])
> +
>  ;; All Advanced SIMD modes suitable for moving, loading, and storing.
>  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>                               V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
> @@ -1471,7 +1486,13 @@
>  (define_mode_attr bitsize [(V8QI "64") (V16QI "128")
>                          (V4HI "64") (V8HI "128")
>                          (V2SI "64") (V4SI "128")
> -                        (V1DI "64") (V2DI "128")])
> +                        (V1DI "64") (V2DI "128")
> +                        (QI "8") (V2QI "16")
> +                        (V4QI "32") (HI "16")
> +                        (HF "16") (BF "16")
> +                        (SI "32") (SF "32")
> +                        (V2HI "32") (V2HF "32")
> +                        (V2BF "32")])
> 
>  ;; Map a floating point or integer mode to the appropriate register name 
> prefix
>  (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
> @@ -1970,10 +1991,16 @@
>  (define_mode_attr V1half [(V2DI "v1di")  (V2DF  "v1df")])
> 
>  ;; Double modes of vector modes.
> -(define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
> +(define_mode_attr VDBL [(V8QI "V16QI") (V4QI "V8QI")
> +                     (V2QI "V4QI")  (V4HI "V8HI")
>                       (V4HF "V8HF")  (V4BF "V8BF")
> +                     (V2BF "V4BF")
>                       (V2SI "V4SI")  (V2SF "V4SF")
> +                     (V2HI "V4HI")  (V2HF "V4HF")
> +                     (BF   "V2BF")
>                       (SI   "V2SI")  (SF   "V2SF")
> +                     (QI   "V2QI")
> +                     (HI   "V2HI")  (HF   "V2HF")
>                       (DI   "V2DI")  (DF   "V2DF")])
> 
>  ;; Load/store pair mode.
> @@ -2201,6 +2228,14 @@
>                            (V2SI "x") (V2SF "x")
>                            (DI   "x") (DF   "x")])
> 
> +(define_mode_attr single_dwx [(SI  "x") (SF   "x")
> +                          (V2QI "w") (V4QI "x")
> +                          (V2HI "x") (V2HF "x")
> +                          (HF   "w") (QI   "w")
> +                          (V2BF "x") (BF   "w")
> +                          (HI   "w")])
> +
> +
>  ;; Whether a mode fits in S or D registers (i.e. "s" for 32-bit modes
>  ;; and "d" for 64-bit modes).
>  (define_mode_attr single_type [(SI   "s") (SF   "s")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c
> b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c
> index 2a07c0be866..75283d355ae 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_10.c
> @@ -11,7 +11,8 @@ foo (uint64_t *restrict x, uint64_t *restrict y, uint64_t
> *restrict index)
>      x[i] += y[index[i]];
>  }
> 
> -/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x[0-9]+, x[0-9]+, lsl
> #?3\]} 2 } } */
> +/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x[0-9]+, x[0-9]+, lsl
> #?3\]} 1 } } */
> +/* { dg-final { scan-assembler-times {\tld1\t{v[0-9]+\.d}\[1\], \[x[0-9]+\]} 
> 1 }
> } */
>  /* { dg-final { scan-assembler-not {\tshl\tv[0-9]+\.2d,} } } */
>  /* { dg-final { scan-assembler-not {\tumov\t} } } */
>  /* { dg-final { scan-assembler {\tadd\tv[0-9]+\.2d,} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> index 2bb2c04fa20..1fbb08c7566 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
> @@ -30,12 +30,12 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int
> n)    \
>  TEST_ALL (VEC_PERM)
> 
>  /* We should use one DUP for each of the 8-, 16- and 32-bit types,
> -   (for now, insert both elements with ins for _Float16).  We should use two
> +   (and we now use fmov + ins for _Float16).  We should use two
>     DUPs for each of the three 64-bit types.  */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
>  /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
> -/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], 
> v[0-9]+\.h\[0\]}
> 1 } } */
> +/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, h} 1 } } */
>  /* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], 
> v[0-9]+\.h\[0\]}
> 1 } } */
>  /* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-
> 9]+\.d\n} 3 } } */
>  /* { dg-final { scan-assembler-not {\tzip2\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> index feeb181a0b5..394537c80d8 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> @@ -15,7 +15,6 @@ int16x8_t foo2(int16_t x)
>    return v;
>  }
> 
> -/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, w[0-9]+} 2 } } */
> -/* { dg-final { scan-assembler-times {\tmov\tw[0-9]+, 65537} 1 } } */
> -/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 0, 16} 1 } } 
> */
> -/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 16, 16} 1 } } 
> */
> +/* { dg-final { scan-assembler-times {\tmov\tw1, 1} 1 } } */
> +/* { dg-final { scan-assembler-times {\tdup\tv0+\.4s, w0} 2 } } */
> +/* { dg-final { scan-assembler-times {\tbfi\tw0, w1, 16, 16} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
> b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
> index 595470b29fb..217838ea55a 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-23.c
> @@ -111,9 +111,8 @@ TEST_64(int, int64_t, s)
> 
>  /*
>  ** test_int8_5:
> -**   mov     w1, 0
> -**   bfi     w1, w0, 0, 8
> -**   dup     v0\.8h, w1
> +**   uxtb    w0, w0
> +**   dup     v0\.8h, w0
>  **   ret
>  */
> 
> @@ -217,7 +216,7 @@ TEST_64(int, int64_t, s)
>  ** test_float16_2:
>  **   fcvt    h1, s1
>  **   fcvt    h0, s0
> -**   ins     v0\.h\[1\], v1\.h\[0\]
> +**   uzp1    v0\.4h, v0\.4h, v1\.4h
>  **   dup     v0\.4s, v0\.s\[0\]
>  **   ret
>  */
> @@ -227,55 +226,51 @@ TEST_64(int, int64_t, s)
>  **   uzp1    v2\.2s, v0\.2s, v2\.2s
>  **   uzp1    v3\.2s, v1\.2s, v3\.2s
>  **   zip1    v3\.4s, v2\.4s, v3\.4s
> -**   fcvtn   v0\.4h, v3\.4s
> -**   uzp1    v0\.2d, v0\.2d, v0\.2d
> +**   fcvtn   v3\.4h, v3\.4s
> +**   dup     v0\.2d, v3\.d\[0\]
>  **   ret
>  */
> 
>  /*
>  ** test_float16_4:
>  **   fcvt    h0, s0
> -**   movi    v31\.2d, #0
> -**   ins     v31\.h\[0\], v0\.h\[0\]
> -**   dup     v0\.4s, v31\.s\[0\]
> +**   fmov    h0, h0
> +**   dup     v0\.4s, v0\.s\[0\]
>  **   ret
>  */
> 
>  /*
>  ** test_float16_5:
> +**   movi    v31\.4h, #0
>  **   fcvt    h0, s0
> -**   movi    v31\.2d, #0
> -**   ins     v31\.h\[1\], v0\.h\[0\]
> -**   dup     v0\.4s, v31\.s\[0\]
> +**   uzp1    v0\.4h, v31\.4h, v0\.4h
> +**   dup     v0\.4s, v0\.s\[0\]
>  **   ret
>  */
> 
>  /*
>  ** test_float16_6:
> -**   fcvt    h1, s1
>  **   fcvt    h0, s0
> -**   movi    v31\.2d, #0
> -**   mov     w0, 1006648320
> -**   umov    w1, v1\.h\[0\]
> -**   ins     v31\.h\[0\], v0\.h\[0\]
> -**   bfi     w0, w1, 0, 16
> -**   dup     v31\.2s, v31\.s\[0\]
> -**   dup     v0\.2s, w0
> -**   zip1    v0\.8h, v31\.8h, v0\.8h
> +**   fcvt    h1, s1
> +**   fmov    h31, 1.0e\+0
> +**   fmov    h0, h0
> +**   uzp1    v1\.4h, v1\.4h, v31\.4h
> +**   dup     v0\.2s, v0\.s\[0\]
> +**   dup     v1\.2s, v1\.s\[0\]
> +**   zip1    v0\.8h, v0\.8h, v1\.8h
>  **   ret
>  */
> 
>  /*
>  ** test_float16_7:
> -**   fcvt    h1, s1
>  **   fcvt    h0, s0
> -**   movi    v31\.2d, #0
> -**   mov     w0, 1006648320
> -**   umov    w1, v1\.h\[0\]
> -**   ins     v31\.h\[1\], v0\.h\[0\]
> -**   bfi     w0, w1, 16, 16
> +**   movi    v31\.4h, #0
> +**   fcvt    h1, s1
> +**   uzp1    v31\.4h, v31\.4h, v0\.4h
> +**   fmov    h0, 1.0e\+0
> +**   uzp1    v0\.4h, v0\.4h, v1\.4h
>  **   dup     v31\.2s, v31\.s\[0\]
> -**   dup     v0\.2s, w0
> +**   dup     v0\.2s, v0\.s\[0\]
>  **   zip1    v0\.8h, v31\.8h, v0\.8h
>  **   ret
>  */
> @@ -285,7 +280,7 @@ TEST_64(int, int64_t, s)
>  **   fcvt    h1, s1
>  **   fcvt    h0, s0
>  **   movi    v31\.2s, 0x3c, lsl 24
> -**   ins     v0\.h\[1\], v1\.h\[0\]
> +**   uzp1    v0\.4h, v0\.4h, v1\.4h
>  **   dup     v0\.2s, v0\.s\[0\]
>  **   zip1    v0\.8h, v31\.8h, v0\.8h
>  **   ret
> @@ -316,9 +311,8 @@ TEST_64(int, int64_t, s)
> 
>  /*
>  ** test_int16_4:
> -**   mov     w1, 0
> -**   bfi     w1, w0, 0, 16
> -**   dup     v0\.4s, w1
> +**   uxth    w0, w0
> +**   dup     v0\.4s, w0
>  **   ret
>  */
> 
> @@ -332,12 +326,11 @@ TEST_64(int, int64_t, s)
> 
>  /*
>  ** test_int16_6:
> -**   mov     w2, 0
> -**   bfi     w2, w0, 0, 16
> -**   mov     w0, 65537
> -**   bfi     w0, w1, 0, 16
> -**   dup     v31\.2s, w2
> -**   dup     v0\.2s, w0
> +**   uxth    w0, w0
> +**   dup     v31\.2s, w0
> +**   mov     w0, 1
> +**   bfi     w1, w0, 16, 16
> +**   dup     v0\.2s, w1
>  **   zip1    v0\.8h, v31\.8h, v0\.8h
>  **   ret
>  */
> @@ -378,17 +371,16 @@ TEST_64(int, int64_t, s)
> 
>  /*
>  ** test_float32_3:
> -**   movi    v31\.2s, 0
> -**   dup     v0\.2s, v0\.s\[0\]
> -**   zip1    v0\.4s, v0\.4s, v31\.4s
> +**   fmov    s0, s0
> +**   dup     v0\.2d, v0\.d\[0\]
>  **   ret
>  */
> 
>  /*
>  ** test_float32_4:
> -**   movi    v31\.2s, 0
> -**   dup     v0\.2s, v0\.s\[0\]
> -**   zip1    v0\.4s, v31\.4s, v0\.4s
> +**   movi    v31\.2s, #0
> +**   uzp1    v0\.2s, v31\.2s, v0\.2s
> +**   dup     v0\.2d, v0\.d\[0\]
>  **   ret
>  */
> 
> @@ -408,9 +400,8 @@ TEST_64(int, int64_t, s)
> 
>  /*
>  ** test_int32_3:
> -**   dup     v31\.2s, w0
> -**   movi    v0\.2s, 0
> -**   zip1    v0\.4s, v31\.4s, v0\.4s
> +**   fmov    s0, w0
> +**   dup     v0\.2d, v0\.d\[0\]
>  **   ret
>  */
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
> b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
> index 587b7ec0e3b..98f75336d86 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
> @@ -47,8 +47,8 @@ int32x4_t f_s32(int32_t x)
>  /*
>  ** f_s64:
>  **   fmov    d0, x0
> -**   mov     (x[0-9]+), 1
> -**   ins     v0\.d\[1\], \1
> +**   mov     x0, 1
> +**   ins     v0\.d\[1\], x0
>  **   ret
>  */
> 
> --
> 2.43.0

RE: [PATCH 3/4] aarch64: implement vec_concat support for sub-64-bit types

Reply via email to