Tamar Christina <tamar.christ...@arm.com> writes: >> -----Original Message----- >> From: Richard Sandiford <richard.sandif...@arm.com> >> Sent: Friday, January 3, 2025 10:59 AM >> To: Tamar Christina <tamar.christ...@arm.com> >> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>; Richard Earnshaw >> <richard.earns...@arm.com>; ktkac...@gcc.gnu.org >> Subject: Re: [PATCH]AArch64: Implement four and eight chunk VLA concats >> [PR118272] >> >> Tamar Christina <tamar.christ...@arm.com> writes: >> >> > >> >> > How about instead doing something like: >> >> > >> >> > worklist.reserve (nelts); >> >> > for (int i = 0; i < nelts; ++i) >> >> > worklist.quick_push (force_reg (elem_mode, XVECEXP (vals, 0, i))); >> >> > >> >> > while (nelts > 2) >> >> > { >> >> > for (int i = 0; i < nelts; i += 2) >> >> > { >> >> > ...read worklist[i] and worklist[i + 1]... >> >> > ...write back to worklist[i / 2]... >> >> > } >> >> > nelts /= 2; >> >> > } >> >> > >> >> > emit_insn (gen_aarch64_pack_partial (mode, target, worklist[0], >> >> > worklist[1])); >> >> > >> >> >> >> I think this is better since we above the memmove.. Apologies for the bug >> >> ☹ >> >> >> >> I'll respin. >> > >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. >> > >> > Ok for master? >> > >> > Thanks, >> > Tamar >> > >> > gcc/ChangeLog: >> > >> > PR target/96342 >> > PR target/118272 >> > * config/aarch64/aarch64-sve.md (vec_init<mode><Vquad>, >> > vec_initvnx16qivnx2qi): New. >> > * config/aarch64/aarch64.cc >> > (aarch64_sve_expand_vector_init_subvector): >> > Rewrite to support any arbitrary combinations. >> > * config/aarch64/iterators.md (SVE_NO2E): Update to use SVE_NO4E >> > (SVE_NO2E, Vquad): New. >> > >> > gcc/testsuite/ChangeLog: >> > >> > PR target/96342 >> > PR target/118272 >> > * gcc.target/aarch64/vect-simd-clone-3.c: New test. >> > >> > -- inline copy of patch -- >> > >> > diff --git a/gcc/config/aarch64/aarch64-sve.md >> > b/gcc/config/aarch64/aarch64- >> sve.md >> > index >> 6659bb4fcab34699f22ff883825de1cd67108203..35f55bfacfc3238a8a7aa69015 >> f36ba32981af59 100644 >> > --- a/gcc/config/aarch64/aarch64-sve.md >> > +++ b/gcc/config/aarch64/aarch64-sve.md >> > @@ -2839,6 +2839,7 @@ (define_expand "vec_init<mode><Vel>" >> > } >> > ) >> > >> > +;; Vector constructor combining two half vectors { a, b } >> > (define_expand "vec_init<mode><Vhalf>" >> > [(match_operand:SVE_NO2E 0 "register_operand") >> > (match_operand 1 "")] >> > @@ -2849,6 +2850,28 @@ (define_expand "vec_init<mode><Vhalf>" >> > } >> > ) >> > >> > +;; Vector constructor combining four quad vectors { a, b, c, d } >> > +(define_expand "vec_init<mode><Vquad>" >> > + [(match_operand:SVE_NO4E 0 "register_operand") >> > + (match_operand 1 "")] >> > + "TARGET_SVE" >> > + { >> > + aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]); >> > + DONE; >> > + } >> > +) >> > + >> > +;; Vector constructor combining eight vectors { a, b, c, d, ... } >> > +(define_expand "vec_initvnx16qivnx2qi" >> > + [(match_operand:VNx16QI 0 "register_operand") >> > + (match_operand 1 "")] >> > + "TARGET_SVE" >> > + { >> > + aarch64_sve_expand_vector_init_subvector (operands[0], operands[1]); >> > + DONE; >> > + } >> > +) >> > + >> > ;; Shift an SVE vector left and insert a scalar into element 0. >> > (define_insn "vec_shl_insert_<mode>" >> > [(set (match_operand:SVE_FULL 0 "register_operand") >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc >> > index >> cb9b155826d12b622ae0df1736e4b042d01cf56a..c7c165ddb3a825a4e5ce4fc1 >> 18d39047ce9bf7e8 100644 >> > --- a/gcc/config/aarch64/aarch64.cc >> > +++ b/gcc/config/aarch64/aarch64.cc >> > @@ -24898,18 +24898,42 @@ aarch64_sve_expand_vector_init_subvector (rtx >> target, rtx vals) >> > machine_mode mode = GET_MODE (target); >> > int nelts = XVECLEN (vals, 0); >> > >> > - gcc_assert (nelts == 2); >> > + gcc_assert (nelts % 2 == 0); >> > >> > - rtx arg0 = XVECEXP (vals, 0, 0); >> > - rtx arg1 = XVECEXP (vals, 0, 1); >> > - >> > - /* If we have two elements and are concatting vector. */ >> > - machine_mode elem_mode = GET_MODE (arg0); >> > + /* We have to be concatting vector. */ >> > + machine_mode elem_mode = GET_MODE (XVECEXP (vals, 0, 0)); >> > gcc_assert (VECTOR_MODE_P (elem_mode)); >> > >> > - arg0 = force_reg (elem_mode, arg0); >> > - arg1 = force_reg (elem_mode, arg1); >> > - emit_insn (gen_aarch64_pack_partial (mode, target, arg0, arg1)); >> > + auto_vec<rtx> worklist; >> > + machine_mode wider_mode = elem_mode; >> > + >> > + for (int i = 0; i < nelts; i++) >> > + worklist.safe_push (force_reg (elem_mode, XVECEXP (vals, 0, i))); >> > + >> > + /* Keep widening pairwise to have maximum throughput. */ >> > + while (nelts >= 2) >> > + { >> > + wider_mode >> > + = related_vector_mode (wider_mode, GET_MODE_INNER (wider_mode), >> > + GET_MODE_NUNITS (wider_mode) * 2).require >> > (); >> >> Formatting nit, sorry, but: the last two lines seem to be indented >> two columns too far. >> >> > + >> > + for (int i = 0; i < nelts; i += 2) >> > + { >> > + rtx arg0 = worklist[i]; >> > + rtx arg1 = worklist[i+1]; >> > + gcc_assert (GET_MODE (arg0) == GET_MODE (arg1)); >> > + >> > + rtx tmp = gen_reg_rtx (wider_mode); >> > + emit_insn (gen_aarch64_pack_partial (wider_mode, tmp, arg0, >> > arg1)); >> > + worklist[i / 2] = tmp; >> > + } >> > + >> > + nelts /= 2; >> > + } >> > + >> > + gcc_assert (wider_mode == mode); >> > + emit_move_insn (target, worklist[0]); >> >> The reason I'd suggested using "while (nelts > 2)" and ending with: >> >> emit_insn (gen_aarch64_pack_partial (mode, target, worklist[0], >> worklist[1])); >> >> is that it avoids an unnecessary temporary that would otherwise need to >> be cleaned up later. But I won't insist. > > Ah I see, the reason I had >= is because I wanted the mode assert which seemed > useful to check that the right number of elements were processed. > > I think emit_move_insn would assert when the modes don't match, but emit_insn > Just makes the set no?
Yeah, that's right. It'd instead ICE in vregs, because the insn wouldn't be recognised. > Happy to go with either and will change it to your suggestion, but just > wanted to point > out why I hadn't before. I'm happy with your version too. :) Richard > Cheers, > Tamar > >> >> OK with the formatting nit fixed, thanks. >> >> Richard >> >> >> >> > + >> > return; >> > } >> > >> > diff --git a/gcc/config/aarch64/iterators.md >> > b/gcc/config/aarch64/iterators.md >> > index >> 07b97547cb292e6d4dc1040173a5d5525fb268d5..60bad5c72bc667be19f8224 >> af87ec5451b4b1a9a 100644 >> > --- a/gcc/config/aarch64/iterators.md >> > +++ b/gcc/config/aarch64/iterators.md >> > @@ -140,9 +140,12 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI]) >> > ;; VQ without 2 element modes. >> > (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF]) >> > >> > +;; SVE modes without 2 and 4 element modes. >> > +(define_mode_iterator SVE_NO4E [VNx16QI VNx8QI VNx8HI VNx8HF >> VNx8BF]) >> > + >> > ;; SVE modes without 2 element modes. >> > -(define_mode_iterator SVE_NO2E [VNx16QI VNx8QI VNx4QI VNx8HI VNx4HI >> VNx8HF >> > - VNx4HF VNx8BF VNx4BF VNx4SI VNx4SF]) >> > +(define_mode_iterator SVE_NO2E [SVE_NO4E VNx4QI VNx4HI VNx4HF >> VNx4BF VNx4SI >> > + VNx4SF]) >> > >> > ;; 2 element quad vector modes. >> > (define_mode_iterator VQ_2E [V2DI V2DF]) >> > @@ -1764,6 +1767,11 @@ (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI >> "v8qi") >> > (VNx8BF "vnx4bf") (VNx4BF "vnx2bf") >> > (VNx4SI "vnx2si") (VNx4SF "vnx2sf")]) >> > >> > +;; Quad modes of all vector modes, in lower-case. >> > +(define_mode_attr Vquad [(VNx16QI "vnx4qi") (VNx8QI "vnx2qi") >> > + (VNx8HI "vnx2hi") (VNx8HF "vnx2hf") >> > + (VNx8BF "vnx2bf")]) >> > + >> > ;; Single-element half modes of quad vector modes. >> > (define_mode_attr V1HALF [(V2DI "V1DI") (V2DF "V1DF")]) >> > >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c >> b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c >> > new file mode 100644 >> > index >> 0000000000000000000000000000000000000000..fb50da1f714ce929da0e35 >> b7d9fead2d93476cb9 >> > --- /dev/null >> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-simd-clone-3.c >> > @@ -0,0 +1,27 @@ >> > +/* { dg-do compile } */ >> > +/* { dg-options "-std=c99" } */ >> > +/* { dg-additional-options "-O3 -march=armv8-a" } */ >> > + >> > +#pragma GCC target ("+sve") >> > +extern char __attribute__ ((simd, const)) fn3 (int, short); >> > +void test_fn3 (float *a, float *b, double *c, int n) >> > +{ >> > + for (int i = 0; i < n; ++i) >> > + a[i] = fn3 (b[i], c[i]); >> > +} >> > + >> > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn3\n} } } */ >> > + >> > +extern char __attribute__ ((simd, const)) fn4 (int, char); >> > +void test_fn4 (float *a, float *b, double *c, int n) >> > +{ >> > + for (int i = 0; i < n; ++i) >> > + a[i] = fn4 (b[i], c[i]); >> > +} >> > + >> > +/* { dg-final { scan-assembler {\s+_ZGVsMxvv_fn4\n} } } */ >> > + >> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.b, z[0-9]+\.b, >> > z[0- >> 9]+\.b\n} 6 } } */ >> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.h, z[0-9]+\.h, >> > z[0- >> 9]+\.h\n} 16 } } */ >> > +/* { dg-final { scan-assembler-times {\s+uzp1\tz[0-9]+\.s, z[0-9]+\.s, >> > z[0- >> 9]+\.s\n} 24 } } */ >> > +