Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:

> On Tue, 2 May 2023 at 18:22, Richard Sandiford
> <richard.sandif...@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
>> > On Tue, 2 May 2023 at 17:32, Richard Sandiford
>> > <richard.sandif...@arm.com> wrote:
>> >>
>> >> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
>> >> > On Tue, 2 May 2023 at 14:56, Richard Sandiford
>> >> > <richard.sandif...@arm.com> wrote:
>> >> >> > [aarch64] Improve code-gen for vector initialization with single 
>> >> >> > constant element.
>> >> >> >
>> >> >> > gcc/ChangeLog:
>> >> >> >       * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak 
>> >> >> > condition
>> >> >> >       if (n_var == n_elts && n_elts <= 16) to allow a single 
>> >> >> > constant,
>> >> >> >       and if maxv == 1, use constant element for duplicating into 
>> >> >> > register.
>> >> >> >
>> >> >> > gcc/testsuite/ChangeLog:
>> >> >> >       * gcc.target/aarch64/vec-init-single-const.c: New test.
>> >> >> >
>> >> >> > diff --git a/gcc/config/aarch64/aarch64.cc 
>> >> >> > b/gcc/config/aarch64/aarch64.cc
>> >> >> > index 2b0de7ca038..f46750133a6 100644
>> >> >> > --- a/gcc/config/aarch64/aarch64.cc
>> >> >> > +++ b/gcc/config/aarch64/aarch64.cc
>> >> >> > @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx 
>> >> >> > vals)
>> >> >> >       and matches[X][1] with the count of duplicate elements (if X 
>> >> >> > is the
>> >> >> >       earliest element which has duplicates).  */
>> >> >> >
>> >> >> > -  if (n_var == n_elts && n_elts <= 16)
>> >> >> > +  if ((n_var >= n_elts - 1) && n_elts <= 16)
>> >> >> >      {
>> >> >> >        int matches[16][2] = {0};
>> >> >> >        for (int i = 0; i < n_elts; i++)
>> >> >> > @@ -22227,6 +22227,18 @@ aarch64_expand_vector_init (rtx target, rtx 
>> >> >> > vals)
>> >> >> >            vector register.  For big-endian we want that position to 
>> >> >> > hold
>> >> >> >            the last element of VALS.  */
>> >> >> >         maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
>> >> >> > +
>> >> >> > +       /* If we have a single constant element, use that for 
>> >> >> > duplicating
>> >> >> > +          instead.  */
>> >> >> > +       if (n_var == n_elts - 1)
>> >> >> > +         for (int i = 0; i < n_elts; i++)
>> >> >> > +           if (CONST_INT_P (XVECEXP (vals, 0, i))
>> >> >> > +               || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
>> >> >> > +             {
>> >> >> > +               maxelement = i;
>> >> >> > +               break;
>> >> >> > +             }
>> >> >> > +
>> >> >> >         rtx x = force_reg (inner_mode, XVECEXP (vals, 0, 
>> >> >> > maxelement));
>> >> >> >         aarch64_emit_move (target, lowpart_subreg (mode, x, 
>> >> >> > inner_mode));
>> >> >>
>> >> >> We don't want to force the constant into a register though.
>> >> > OK right, sorry.
>> >> > With the attached patch, for the following test-case:
>> >> > int64x2_t f_s64(int64_t x)
>> >> > {
>> >> >   return (int64x2_t) { x, 1 };
>> >> > }
>> >> >
>> >> > it loads constant from memory (same code-gen as without patch).
>> >> > f_s64:
>> >> >         adrp    x1, .LC0
>> >> >         ldr     q0, [x1, #:lo12:.LC0]
>> >> >         ins     v0.d[0], x0
>> >> >         ret
>> >> >
>> >> > Does the patch look OK ?
>> >> >
>> >> > Thanks,
>> >> > Prathamesh
>> >> > [...]
>> >> > [aarch64] Improve code-gen for vector initialization with single 
>> >> > constant element.
>> >> >
>> >> > gcc/ChangeLog:
>> >> >       * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak 
>> >> > condition
>> >> >       if (n_var == n_elts && n_elts <= 16) to allow a single constant,
>> >> >       and if maxv == 1, use constant element for duplicating into 
>> >> > register.
>> >> >
>> >> > gcc/testsuite/ChangeLog:
>> >> >       * gcc.target/aarch64/vec-init-single-const.c: New test.
>> >> >
>> >> > diff --git a/gcc/config/aarch64/aarch64.cc 
>> >> > b/gcc/config/aarch64/aarch64.cc
>> >> > index 2b0de7ca038..97309ddec4f 100644
>> >> > --- a/gcc/config/aarch64/aarch64.cc
>> >> > +++ b/gcc/config/aarch64/aarch64.cc
>> >> > @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx 
>> >> > vals)
>> >> >       and matches[X][1] with the count of duplicate elements (if X is 
>> >> > the
>> >> >       earliest element which has duplicates).  */
>> >> >
>> >> > -  if (n_var == n_elts && n_elts <= 16)
>> >> > +  if ((n_var >= n_elts - 1) && n_elts <= 16)
>> >>
>> >> No need for the extra brackets.
>> > Adjusted, thanks. Sorry if this sounds like a silly question, but why
>> > do we need the n_elts <= 16 check ?
>> > Won't n_elts be always <= 16 since max number of elements in a vector
>> > would be 16 for V16QI ?
>>
>> Was wondering the same thing :)
>>
>> Let's leave it though.
>>
>> >> >      {
>> >> >        int matches[16][2] = {0};
>> >> >        for (int i = 0; i < n_elts; i++)
>> >> > @@ -22227,8 +22227,26 @@ aarch64_expand_vector_init (rtx target, rtx 
>> >> > vals)
>> >> >            vector register.  For big-endian we want that position to 
>> >> > hold
>> >> >            the last element of VALS.  */
>> >> >         maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
>> >> > -       rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
>> >> > -       aarch64_emit_move (target, lowpart_subreg (mode, x, 
>> >> > inner_mode));
>> >> > +
>> >> > +       /* If we have a single constant element, use that for 
>> >> > duplicating
>> >> > +          instead.  */
>> >> > +       if (n_var == n_elts - 1)
>> >> > +         for (int i = 0; i < n_elts; i++)
>> >> > +           if (CONST_INT_P (XVECEXP (vals, 0, i))
>> >> > +               || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
>> >> > +             {
>> >> > +               maxelement = i;
>> >> > +               break;
>> >> > +             }
>> >> > +
>> >> > +       rtx maxval = XVECEXP (vals, 0, maxelement);
>> >> > +       if (!(CONST_INT_P (maxval) || CONST_DOUBLE_P (maxval)))
>> >> > +         {
>> >> > +           rtx x = force_reg (inner_mode, XVECEXP (vals, 0, 
>> >> > maxelement));
>> >> > +           aarch64_emit_move (target, lowpart_subreg (mode, x, 
>> >> > inner_mode));
>> >> > +         }
>> >> > +       else
>> >> > +         aarch64_emit_move (target, gen_vec_duplicate (mode, maxval));
>> >> >       }
>> >> >        else
>> >> >       {
>> >>
>> >> This seems a bit convoluted.  It might be easier to record whether
>> >> we see a CONST_INT_P or a CONST_DOUBLE_P during the previous loop,
>> >> and if so what the constant is.  Then handle that case first,
>> >> as a separate arm of the "if".
>> > Adjusted in the attached patch. Does it look OK ?
>>
>> I meant: adjust
>>
>>       int maxelement = 0;
>>       int maxv = 0;
>>       for (int i = 0; i < n_elts; i++)
>>         if (matches[i][1] > maxv)
>>           {
>>             maxelement = i;
>>             maxv = matches[i][1];
>>           }
>>
>> so that it also records any CONST_INT or CONST_DOUBLE (as an rtx).
> Oh right. Adjusted in the attached patch, but I also added
> const_elem_pos to keep track of the position,
> to set maxelement to it since it's later used to skip duplicated element here:
>
>     /* Insert the rest.  */
>       for (int i = 0; i < n_elts; i++)
>         {
>           rtx x = XVECEXP (vals, 0, i);
>           if (matches[i][0] == maxelement)
>             continue;
>           x = force_reg (inner_mode, x);
>           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
>         }
>       return;
>
> Does that look OK ?

Yeah, looks good.

>> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c 
>> >> > b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
>> >> > new file mode 100644
>> >> > index 00000000000..682fd43439a
>> >> > --- /dev/null
>> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
>> >> > @@ -0,0 +1,66 @@
>> >> > +/* { dg-do compile } */
>> >> > +/* { dg-options "-O2" } */
>> >> > +/* { dg-final { check-function-bodies "**" "" "" } } */
>> >> > +
>> >> > +#include <arm_neon.h>
>> >> > +
>> >> > +/*
>> >> > +** f_s8:
>> >> > +**   ...
>> >> > +**   dup     v[0-9]+\.16b, w[0-9]+
>> >> > +**   movi    v[0-9]+\.8b, 0x1
>> >> > +**   ins     v[0-9]+\.b\[15\], v[0-9]+\.b\[0\]
>> >> > +**   ...
>> >> > +**   ret
>> >>
>> >> Like with the divide-and-conquer patch, there's nothing that requires
>> >> the first two instructions to be in that order.
>> > Hmm, will it be OK to disable scheduling by passing
>> > -fno-schedule-insns -fno-schedule-insns2
>> > for the test ?
>>
>> Guess we might as well try that for now.
>>
>> Elsewhere I've used:
>>
>>   (
>>      first sequence
>>   |
>>      second sequence
>>   )
>>      common part
>>
>> but we probably have enough control over the unscheduled sequence
>> for that not to be necessary here.
>>
>> >> What is the second ... hiding?  What sequences do we actually generate?
>> > Sorry, added them by mistake. They were the exact sequences. Adjusted
>> > tests in the patch.
>> >>
>> >> BTW, remember to say how patches were tested :-)
>> > Right, sorry. The patch is under bootstrap+test on aarch64-linux-gnu.
>>
>> Please also test the new tests on big-endian.
> Done, thanks.
>>
>> > +/*
>> > +** f_s8:
>> > +**   dup     v[0-9]+\.16b, w[0-9]+
>>
>> Without the ...s, this must be v0 and w0 respectively
>>
>> > +**   movi    v[0-9]+\.8b, 0x1
>>
>> Would be good to capture the register number here and use \1 in the
>> following line.
>>
>> > +**   ins     v[0-9]+\.b\[15\], v[0-9]+\.b\[0\]
>>
>> Similarly v0 for the first operand here.
> Done, thanks.
> I verified the big-endian test passes on aarch64_be-linux-gnu, and
> patch is under bootstrap+test on aarch64-linux-gnu.
> OK to commit if passes ?

OK, thanks.

Richard

>
> Thanks,
> Prathamesh
>>
>> Thanks,
>> Richard
>>
>> > +**   ret
>> > +*/
>> > +
>> > +int8x16_t f_s8(int8_t x)
>> > +{
>> > +  return (int8x16_t) { x, x, x, x, x, x, x, x,
>> > +                       x, x, x, x, x, x, x, 1 };
>> > +}
>> > +
>> > +/*
>> > +** f_s16:
>> > +**   dup     v[0-9]+\.8h, w[0-9]+
>> > +**   movi    v[0-9]+\.4h, 0x1
>> > +**   ins     v[0-9]+\.h\[7\], v[0-9]+\.h\[0\]
>> > +**   ret
>> > +*/
>> > +
>> > +int16x8_t f_s16(int16_t x)
>> > +{
>> > +  return (int16x8_t) { x, x, x, x, x, x, x, 1 };
>> > +}
>> > +
>> > +/*
>> > +** f_s32:
>> > +**   dup     v[0-9]\.4s, w[0-9]+
>> > +**   movi    v[0-9]\.2s, 0x1
>> > +**   ins     v[0-9]+\.s\[3\], v[0-9]+\.s\[0\]
>> > +**   ret
>> > +*/
>> > +
>> > +int32x4_t f_s32(int32_t x)
>> > +{
>> > +  return (int32x4_t) { x, x, x, 1 };
>> > +}
>> > +
>> > +/*
>> > +** f_s64:
>> > +**   adrp    x[0-9]+, .LC[0-9]+
>> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
>> > +**   ins     v[0-9]+\.d\[0\], x[0-9]+
>> > +**   ret
>> > +*/
>> > +
>> > +int64x2_t f_s64(int64_t x)
>> > +{
>> > +  return (int64x2_t) { x, 1 };
>> > +}
>
> [aarch64] Improve code-gen for vector initialization with single constant 
> element.
>
> gcc/ChangeLog:
>       * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak condition
>       if (n_var == n_elts && n_elts <= 16) to allow a single constant,
>       and if maxv == 1, use constant element for duplicating into register.
>
> gcc/testsuite/ChangeLog:
>       * gcc.target/aarch64/vec-init-single-const.c: New test.
>       * gcc.target/aarch64/vec-init-single-const-be.c: Likewise.
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 2b0de7ca038..1ae8cf530e9 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>       and matches[X][1] with the count of duplicate elements (if X is the
>       earliest element which has duplicates).  */
>  
> -  if (n_var == n_elts && n_elts <= 16)
> +  if (n_var >= n_elts - 1 && n_elts <= 16)
>      {
>        int matches[16][2] = {0};
>        for (int i = 0; i < n_elts; i++)
> @@ -22184,12 +22184,23 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>       }
>        int maxelement = 0;
>        int maxv = 0;
> +      rtx const_elem = NULL_RTX;
> +      int const_elem_pos = 0;
> +
>        for (int i = 0; i < n_elts; i++)
> -     if (matches[i][1] > maxv)
> -       {
> -         maxelement = i;
> -         maxv = matches[i][1];
> -       }
> +     {
> +       if (matches[i][1] > maxv)
> +         {
> +           maxelement = i;
> +           maxv = matches[i][1];
> +         }
> +       if (CONST_INT_P (XVECEXP (vals, 0, i))
> +           || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
> +         {
> +           const_elem_pos = i; 
> +           const_elem = XVECEXP (vals, 0, i);
> +         }
> +     }
>  
>        /* Create a duplicate of the most common element, unless all elements
>        are equally useless to us, in which case just immediately set the
> @@ -22227,8 +22238,19 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>            vector register.  For big-endian we want that position to hold
>            the last element of VALS.  */
>         maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
> -       rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> -       aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
> +
> +       /* If we have a single constant element, use that for duplicating
> +          instead.  */
> +       if (const_elem)
> +         {
> +           maxelement = const_elem_pos;
> +           aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
> +         }
> +       else
> +         {
> +           rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> +           aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
> +         }
>       }
>        else
>       {
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const-be.c 
> b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const-be.c
> new file mode 100644
> index 00000000000..f84befa4c11
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const-be.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { be } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**   dup     v0.16b, w0
> +**   movi    (v[0-9]+)\.8b, 0x1
> +**   ins     v0.b\[0\], \1\.b\[0\]
> +**   ret
> +*/
> +
> +int8x16_t f_s8(int8_t x)
> +{
> +  return (int8x16_t) { x, x, x, x, x, x, x, x,
> +                       x, x, x, x, x, x, x, 1 };
> +}
> +
> +/*
> +** f_s16:
> +**   dup     v0.8h, w0
> +**   movi    (v[0-9]+)\.4h, 0x1
> +**   ins     v0.h\[0\], \1\.h\[0\]
> +**   ret
> +*/
> +
> +int16x8_t f_s16(int16_t x)
> +{
> +  return (int16x8_t) { x, x, x, x, x, x, x, 1 };
> +}
> +
> +/*
> +** f_s32:
> +**   dup     v0.4s, w0
> +**   movi    (v[0-9])\.2s, 0x1
> +**   ins     v0.s\[0\], \1\.s\[0\]
> +**   ret
> +*/
> +
> +int32x4_t f_s32(int32_t x)
> +{
> +  return (int32x4_t) { x, x, x, 1 };
> +}
> +
> +/*
> +** f_s64:
> +**   adrp    x[0-9]+, .LC[0-9]+
> +**   ldr     q0, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**   ins     v0\.d\[1\], x0
> +**   ret
> +*/
> +
> +int64x2_t f_s64(int64_t x)
> +{
> +  return (int64x2_t) { x, 1 };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c 
> b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
> new file mode 100644
> index 00000000000..f736bfc3b68
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**   dup     v0.16b, w0
> +**   movi    (v[0-9]+)\.8b, 0x1
> +**   ins     v0.b\[15\], \1\.b\[0\]
> +**   ret
> +*/
> +
> +int8x16_t f_s8(int8_t x)
> +{
> +  return (int8x16_t) { x, x, x, x, x, x, x, x,
> +                       x, x, x, x, x, x, x, 1 };
> +}
> +
> +/*
> +** f_s16:
> +**   dup     v0.8h, w0
> +**   movi    (v[0-9]+)\.4h, 0x1
> +**   ins     v0.h\[7\], \1\.h\[0\]
> +**   ret
> +*/
> +
> +int16x8_t f_s16(int16_t x)
> +{
> +  return (int16x8_t) { x, x, x, x, x, x, x, 1 };
> +}
> +
> +/*
> +** f_s32:
> +**   dup     v0.4s, w0
> +**   movi    (v[0-9])\.2s, 0x1
> +**   ins     v0.s\[3\], \1\.s\[0\]
> +**   ret
> +*/
> +
> +int32x4_t f_s32(int32_t x)
> +{
> +  return (int32x4_t) { x, x, x, 1 };
> +}
> +
> +/*
> +** f_s64:
> +**   adrp    x[0-9]+, .LC[0-9]+
> +**   ldr     q0, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**   ins     v0\.d\[0\], x0
> +**   ret
> +*/
> +
> +int64x2_t f_s64(int64_t x)
> +{
> +  return (int64x2_t) { x, 1 };
> +}

Reply via email to