Re: [PATCH]AArch64: Implement four and eight chunk VLA concats [PR118272]

Richard Sandiford Fri, 03 Jan 2025 10:14:29 -0800

Tamar Christina <tamar.christ...@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandif...@arm.com>
>> Sent: Friday, January 3, 2025 10:59 AM
>> To: Tamar Christina <tamar.christ...@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw
>> <richard.earns...@arm.com>; ktkac...@gcc.gnu.org
>> Subject: Re: [PATCH]AArch64: Implement four and eight chunk VLA concats
>> [PR118272]
>> 
>> Tamar Christina <tamar.christ...@arm.com> writes:
>> >> >
>> >> > How about instead doing something like:
>> >> >
>> >> >   worklist.reserve (nelts);
>> >> >   for (int i = 0; i < nelts; ++i)
>> >> >     worklist.quick_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
>> >> >
>> >> >   while (nelts > 2)
>> >> >     {
>> >> >       for (int i = 0; i < nelts; i += 2)
>> >> >         {
>> >> >           ...read worklist[i] and worklist[i + 1]...
>> >> >           ...write back to worklist[i / 2]...
>> >> >         }
>> >> >       nelts /= 2;
>> >> >     }
>> >> >
>> >> >   emit_insn (gen_aarch64_pack_partial (mode, target, worklist[0],
>> >> >                                    worklist[1]));
>> >> >
>> >>
>> >> I think this is better since we above the memmove.. Apologies for the bug 
>> >> ☹
>> >>
>> >> I'll respin.
>> >
>> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >
>> > Ok for master?
>> >
>> > Thanks,
>> > Tamar
>> >
>> > gcc/ChangeLog:
>> >
>> >         PR target/96342
>> >         PR target/118272
>> >         * config/aarch64/aarch64-sve.md (vec_init<mode><Vquad>,
>> >         vec_initvnx16qivnx2qi): New.
>> >         * config/aarch64/aarch64.cc 
>> > (aarch64_sve_expand_vector_init_subvector):
>> >         Rewrite to support any arbitrary combinations.
>> >         * config/aarch64/iterators.md (SVE_NO2E): Update to use SVE_NO4E
>> >         (SVE_NO2E, Vquad): New.
>> >
>> > gcc/testsuite/ChangeLog:
>> >
>> >         PR target/96342
>> >         PR target/118272
>> >         * gcc.target/aarch64/vect-simd-clone-3.c: New test.
>> >
>> > -- inline copy of patch --
>> >
>> > diff --git a/gcc/config/aarch64/aarch64-sve.md 
>> > b/gcc/config/aarch64/aarch64-
>> sve.md
>> > index
>> 6659bb4fcab34699f22ff883825de1cd67108203..35f55bfacfc3238a8a7aa69015
>> f36ba32981af59 100644
>> > --- a/gcc/config/aarch64/aarch64-sve.md
>> > +++ b/gcc/config/aarch64/aarch64-sve.md
>> > @@ -2839,6 +2839,7 @@ (define_expand "vec_init<mode><Vel>"
>> >    }
>> >  )
>> >
>> > +;; Vector constructor combining two half vectors { a, b }
>> >  (define_expand "vec_init<mode><Vhalf>"
>> >    [(match_operand:SVE_NO2E 0 "register_operand")
>> >     (match_operand 1 "")]
>> > @@ -2849,6 +2850,28 @@ (define_expand "vec_init<mode><Vhalf>"
>> >    }
>> >  )
>> >
>> > +;; Vector constructor combining four quad vectors { a, b, c, d }
>> > +(define_expand "vec_init<mode><Vquad>"
>> > +  [(match_operand:SVE_NO4E 0 "register_operand")
>> > +   (match_operand 1 "")]
>> > +  "TARGET_SVE"
>> > +  {
>> > +    aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
>> > +    DONE;
>> > +  }
>> > +)
>> > +
>> > +;; Vector constructor combining eight vectors { a, b, c, d, ... }
>> > +(define_expand "vec_initvnx16qivnx2qi"
>> > +  [(match_operand:VNx16QI 0 "register_operand")
>> > +   (match_operand 1 "")]
>> > +  "TARGET_SVE"
>> > +  {
>> > +    aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]);
>> > +    DONE;
>> > +  }
>> > +)
>> > +
>> >  ;; Shift an SVE vector left and insert a scalar into element 0.
>> >  (define_insn "vec_shl_insert_<mode>"
>> >    [(set (match_operand:SVE_FULL 0 "register_operand")
>> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> > index
>> cb9b155826d12b622ae0df1736e4b042d01cf56a..c7c165ddb3a825a4e5ce4fc1
>> 18d39047ce9bf7e8 100644
>> > --- a/gcc/config/aarch64/aarch64.cc
>> > +++ b/gcc/config/aarch64/aarch64.cc
>> > @@ -24898,18 +24898,42 @@ aarch64_sve_expand_vector_init_subvector (rtx
>> target, rtx vals)
>> >    machine_mode mode = GET_MODE (target);
>> >    int nelts = XVECLEN (vals, 0);
>> >
>> > -  gcc_assert (nelts == 2);
>> > +  gcc_assert (nelts % 2 == 0);
>> >
>> > -  rtx arg0 = XVECEXP (vals, 0, 0);
>> > -  rtx arg1 = XVECEXP (vals, 0, 1);
>> > -
>> > -  /* If we have two elements and are concatting vector.  */
>> > -  machine_mode elem_mode = GET_MODE (arg0);
>> > +  /* We have to be concatting vector.  */
>> > +  machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0));
>> >    gcc_assert (VECTOR_MODE_P (elem_mode));
>> >
>> > -  arg0 = force_reg (elem_mode, arg0);
>> > -  arg1 = force_reg (elem_mode, arg1);
>> > -  emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1));
>> > +  auto_vec<rtx> worklist;
>> > +  machine_mode wider_mode = elem_mode;
>> > +
>> > +  for (int i = 0; i < nelts; i++)
>> > +    worklist.safe_push (force_reg (elem_mode, XVECEXP (vals, 0, i)));
>> > +
>> > +  /* Keep widening pairwise to have maximum throughput.  */
>> > +  while (nelts >= 2)
>> > +    {
>> > +      wider_mode
>> > +         = related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode),
>> > +                                GET_MODE_NUNITS (wider_mode) * 2).require 
>> > ();
>> 
>> Formatting nit, sorry, but: the last two lines seem to be indented
>> two columns too far.
>> 
>> > +
>> > +      for (int i = 0; i < nelts; i += 2)
>> > +       {
>> > +         rtx arg0 = worklist[i];
>> > +         rtx arg1 = worklist[i+1];
>> > +         gcc_assert (GET_MODE (arg0) == GET_MODE (arg1));
>> > +
>> > +         rtx tmp = gen_reg_rtx (wider_mode);
>> > +         emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, 
>> > arg1));
>> > +         worklist[i / 2] = tmp;
>> > +       }
>> > +
>> > +      nelts /= 2;
>> > +    }
>> > +
>> > +  gcc_assert (wider_mode == mode);
>> > +  emit_move_insn (target, worklist[0]);
>> 
>> The reason I'd suggested using "while (nelts > 2)" and ending with:
>> 
>>   emit_insn (gen_aarch64_pack_partial (mode, target, worklist[0],
>>                                     worklist[1]));
>> 
>> is that it avoids an unnecessary temporary that would otherwise need to
>> be cleaned up later.  But I won't insist.
>
> Ah I see, the reason I had >= is because I wanted the mode assert which seemed
> useful to check that the right number of elements were processed.
>
> I think emit_move_insn would assert when the modes don't match, but emit_insn
> Just makes the set no?


Yeah, that's right.  It'd instead ICE in vregs, because the insn
wouldn't be recognised.

> Happy to go with either and will change it to your suggestion, but just 
> wanted to point
> out why I hadn't before.

I'm happy with your version too. :)

Richard

> Cheers,
> Tamar
>
>> 
>> OK with the formatting nit fixed, thanks.
>> 
>> Richard
>> 
>> 
>> 
>> > +
>> >    return;
>> >  }
>> >
>> > diff --git a/gcc/config/aarch64/iterators.md 
>> > b/gcc/config/aarch64/iterators.md
>> > index
>> 07b97547cb292e6d4dc1040173a5d5525fb268d5..60bad5c72bc667be19f8224
>> af87ec5451b4b1a9a 100644
>> > --- a/gcc/config/aarch64/iterators.md
>> > +++ b/gcc/config/aarch64/iterators.md
>> > @@ -140,9 +140,12 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
>> >  ;; VQ without 2 element modes.
>> >  (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
>> >
>> > +;; SVE modes without 2 and 4 element modes.
>> > +(define_mode_iterator SVE_NO4E [VNx16QI VNx8QI VNx8HI VNx8HF
>> VNx8BF])
>> > +
>> >  ;; SVE modes without 2 element modes.
>> > -(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI
>> VNx8HF
>> > -                               VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF])
>> > +(define_mode_iterator SVE_NO2E [SVE_NO4E VNx4QI VNx4HI VNx4HF
>> VNx4BF VNx4SI
>> > +                               VNx4SF])
>> >
>> >  ;; 2 element quad vector modes.
>> >  (define_mode_iterator VQ_2E [V2DI V2DF])
>> > @@ -1764,6 +1767,11 @@ (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI
>> "v8qi")
>> >                          (VNx8BF "vnx4bf")  (VNx4BF "vnx2bf")
>> >                          (VNx4SI "vnx2si")  (VNx4SF "vnx2sf")])
>> >
>> > +;; Quad modes of all vector modes, in lower-case.
>> > +(define_mode_attr Vquad [(VNx16QI "vnx4qi") (VNx8QI "vnx2qi")
>> > +                        (VNx8HI "vnx2hi")  (VNx8HF "vnx2hf")
>> > +                        (VNx8BF "vnx2bf")])
>> > +
>> >  ;; Single-element half modes of quad vector modes.
>> >  (define_mode_attr V1HALF [(V2DI "V1DI")  (V2DF  "V1DF")])
>> >
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
>> b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
>> > new file mode 100644
>> > index
>> 0000000000000000000000000000000000000000..fb50da1f714ce929da0e35
>> b7d9fead2d93476cb9
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c
>> > @@ -0,0 +1,27 @@
>> > +/* { dg-do compile }  */
>> > +/* { dg-options "-std=c99" } */
>> > +/* { dg-additional-options "-O3 -march=armv8-a" } */
>> > +
>> > +#pragma GCC target ("+sve")
>> > +extern char __attribute__ ((simd, const)) fn3 (int, short);
>> > +void test_fn3 (float *a, float *b, double *c, int n)
>> > +{
>> > +  for (int i = 0; i < n; ++i)
>> > +    a[i] = fn3 (b[i], c[i]);
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */
>> > +
>> > +extern char __attribute__ ((simd, const)) fn4 (int, char);
>> > +void test_fn4 (float *a, float *b, double *c, int n)
>> > +{
>> > +  for (int i = 0; i < n; ++i)
>> > +    a[i] = fn4 (b[i], c[i]);
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn4\n} } } */
>> > +
>> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.b, z[0-9]+\.b, 
>> > z[0-
>> 9]+\.b\n} 6 } } */
>> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.h, z[0-9]+\.h, 
>> > z[0-
>> 9]+\.h\n} 16 } } */
>> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.s, z[0-9]+\.s, 
>> > z[0-
>> 9]+\.s\n} 24 } } */
>> > +

Re: [PATCH]AArch64: Implement four and eight chunk VLA concats [PR118272]

Reply via email to