RE: [PATCH] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

2024-09-26 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > This is similar to the recent improvements to the Advanced SIMD
> > popcount expansion by using SVE. We can utilize SVE to generate more
> > efficient code for scalar mode popcount too.
> >
> > PR target/113860
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (popcount<mode>2): Update
> > pattern to also support V1DI mode.
> > * config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE
> > support.
> > * config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
> > (SVE_VDQ_I): Add V1DI.
> > (bitsize): Likewise.
> > (VPRED): Likewise.
> > (VEC_POP_MODE): New mode attribute.
> > (vec_pop_mode): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt11.c: New test.
> 
> Sorry for the slow review of this.  The main reason for putting it off was the
> use of V1DI, which always makes me nervous.
> 
> In particular:
> 
> > @@ -2284,7 +2286,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI")
> (VNx8QI "VNx8BI")
> >  (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
> >  (V8QI "VNx8BI") (V16QI "VNx16BI")
> >  (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
> > -(V4SI "VNx4BI") (V2DI "VNx2BI")])
> > +(V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
> >
> 
> it seems odd to have a predicate mode that contains more elements than the
> associated single-vector data mode.

I agree. I've added a new predicate mode VNx1BI in v2 of the patch. Please let 
me know if I missed anything.

> 
> The patch also extends the non-SVE SIMD popcount pattern for V1DI, but it
> doesn't look like that path works.  E.g. try the following with -march=armv8-a
> -fgimple -O2:
> 
> __Uint64x1_t __GIMPLE
> foo (__Uint64x1_t x)
> {
>   __Uint64x1_t z;
> 
>   z = .POPCOUNT (x);
>   return z;
> }

Good catch! Thanks, Richard. I've fixed this issue and added the example as a 
test case in v2.

v2: https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663916.html

Thanks,
Pengxuan
> 
> Thanks,
> Richard
> 
> 
> >  ;; ...and again in lower case.
> >  (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi") @@
> > -2318,6 +2320,14 @@ (define_mode_attr VDOUBLE [(VNx16QI "VNx32QI")
> >(VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
> >(VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])
> >
> > +;; The Advanced SIMD modes of popcount corresponding to scalar modes.
> > +(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI")
> > +   (SI "V2SI") (DI "V1DI")])
> > +
> > +;; ...and again in lower case.
> > +(define_mode_attr vec_pop_mode [(QI "v8qi") (HI "v4hi")
> > +   (SI "v2si") (DI "v1di")])
> > +
> >  ;; On AArch64 the By element instruction doesn't have a 2S variant.
> >  ;; However because the instruction always selects a pair of values
> > ;; The normal 3SAME instruction can be used here instead.
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt11.c
> > b/gcc/testsuite/gcc.target/aarch64/popcnt11.c
> > new file mode 100644
> > index 000..595b2f9eb93
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt11.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -march=armv8.2-a+sve" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/*
> > +** f_qi:
> > +** ldr b([0-9]+), \[x0\]
> > +** cnt v\1.8b, v\1.8b
> > +** smov w0, v\1.b\[0\]
> > +** ret
> > +*/
> > +unsigned
> > +f_qi (unsigned char *a)
> > +{
> > +  return __builtin_popcountg (a[0]);
> > +}
> > +
> > +/*
> > +** f_hi:
> > +** ldr h([0-9]+), \[x0\]
> > +** ptrue   (p[0-7]).b, all
> > +** cnt z\1.h, \2/m, z\1.h
> > +** smov w0, v\1.h\[0\]
> > +** ret
> > +*/
> > +unsigned
> > +f_hi (unsigned short *a)
> > +{
> > +  return __builtin_popcountg (a[0]);
> > +}
> > +
> > +/*
> > +** f_si:
> > +** ldr s([0-9]+), \[x0\]
> > +** ptrue   (p[0-7]).b, all
> > +** cnt z\1.s, \2/m, z\1.s
> > +** umov x0, v\1.d\[0\]
> > +** ret
> > +*/
> > +unsigned
> > +f_si (unsigned int *a)
> > +{
> > +  return __builtin_popcountg (a[0]);
> > +}
> > +
> > +/*
> > +** f_di:
> > +** ldr d([0-9]+), \[x0\]
> > +** ptrue   (p[0-7])\.b, all
> > +** cnt z\1\.d, \2/m, z\1\.d
> > +** fmov x0, d\1
> > +** ret
> > +*/
> > +unsigned
> > +f_di (unsigned long *a)
> > +{
> > +  return __builtin_popcountg (a[0]);
> > +}


[PATCH v2] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

2024-09-26 Thread Pengxuan Zheng
This is similar to the recent improvements to the Advanced SIMD popcount
expansion by using SVE. We can utilize SVE to generate more efficient code for
scalar mode popcount too.
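
As a quick illustration of what this targets (a hedged sketch, not taken from
the patch; the popcnt11.c test in this patch has the exact checked sequences),
a plain scalar popcount such as:

/* With something like -O2 -march=armv8.2-a+sve and no CSSC, the DImode case is
   expected to become a move into a SIMD register, a single SVE CNT on the .d
   elements, and a move back, instead of the Advanced SIMD cnt/addv chain.  */
unsigned
popcount_u64 (unsigned long long x)
{
  return __builtin_popcountll (x);
}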

Changes since v1:
* v2: Add a new VNx1BI mode and a new test case for V1DI.

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-modes.def (VECTOR_BOOL_MODE): Add VNx1BI.
(ADJUST_NUNITS): Likewise.
(ADJUST_ALIGNMENT): Likewise.
* config/aarch64/aarch64-simd.md (popcount<mode>2): Update pattern to
also support V1DI mode.
* config/aarch64/aarch64.cc (aarch64_sve_pred_mode_p): Add VNx1BImode.
* config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE support.
* config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
(SVE_VDQ_I): Add V1DI.
(bitsize): Likewise.
(VPRED): Likewise.
(VEC_POP_MODE): New mode attribute.
(vec_pop_mode): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt11.c: New test.
* gcc.target/aarch64/popcnt12.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-modes.def|  3 ++
 gcc/config/aarch64/aarch64-simd.md  | 13 -
 gcc/config/aarch64/aarch64.cc   |  3 +-
 gcc/config/aarch64/aarch64.md   |  9 
 gcc/config/aarch64/iterators.md | 16 --
 gcc/testsuite/gcc.target/aarch64/popcnt11.c | 58 +
 gcc/testsuite/gcc.target/aarch64/popcnt12.c | 18 +++
 7 files changed, 114 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt11.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt12.c

diff --git a/gcc/config/aarch64/aarch64-modes.def 
b/gcc/config/aarch64/aarch64-modes.def
index 25a22c1195e..d822d4dfc13 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -53,18 +53,21 @@ VECTOR_BOOL_MODE (VNx16BI, 16, BI, 2);
 VECTOR_BOOL_MODE (VNx8BI, 8, BI, 2);
 VECTOR_BOOL_MODE (VNx4BI, 4, BI, 2);
 VECTOR_BOOL_MODE (VNx2BI, 2, BI, 2);
+VECTOR_BOOL_MODE (VNx1BI, 1, BI, 2);
 
 ADJUST_NUNITS (VNx32BI, aarch64_sve_vg * 16);
 ADJUST_NUNITS (VNx16BI, aarch64_sve_vg * 8);
 ADJUST_NUNITS (VNx8BI, aarch64_sve_vg * 4);
 ADJUST_NUNITS (VNx4BI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx2BI, aarch64_sve_vg);
+ADJUST_NUNITS (VNx1BI, exact_div (aarch64_sve_vg, 2));
 
 ADJUST_ALIGNMENT (VNx32BI, 2);
 ADJUST_ALIGNMENT (VNx16BI, 2);
 ADJUST_ALIGNMENT (VNx8BI, 2);
 ADJUST_ALIGNMENT (VNx4BI, 2);
 ADJUST_ALIGNMENT (VNx2BI, 2);
+ADJUST_ALIGNMENT (VNx1BI, 2);
 
 /* Bfloat16 modes.  */
 FLOAT_MODE (BF, 2, 0);
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 23c03a96371..386b1fa1f4b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3515,8 +3515,9 @@ (define_insn "popcount<mode>2"
 )
 
 (define_expand "popcount<mode>2"
-  [(set (match_operand:VDQHSD 0 "register_operand")
-   (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  [(set (match_operand:VDQHSD_V1DI 0 "register_operand")
+   (popcount:VDQHSD_V1DI
+ (match_operand:VDQHSD_V1DI 1 "register_operand")))]
   "TARGET_SIMD"
   {
 if (TARGET_SVE)
@@ -3528,6 +3529,14 @@ (define_expand "popcount<mode>2"
DONE;
   }
 
+    if (<MODE>mode == V1DImode)
+      {
+        rtx out = gen_reg_rtx (DImode);
+        emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1])));
+        emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out));
+        DONE;
+      }
+
 /* Generate a byte popcount.  */
 machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
 rtx tmp = gen_reg_rtx (mode);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 92763d403c7..78f65f886b7 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1476,7 +1476,8 @@ aarch64_sve_pred_mode_p (machine_mode mode)
  && (mode == VNx16BImode
  || mode == VNx8BImode
  || mode == VNx4BImode
- || mode == VNx2BImode));
+ || mode == VNx2BImode
+ || mode == VNx1BImode));
 }
 
 /* Three mutually-exclusive flags describing a vector or predicate type.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c54b29cd64b..ef52770f1cb 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5345,6 +5345,15 @@ (define_expand "popcount<mode>2"
(popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
   "TARGET_CSSC ? GET_MODE_BITSIZE (mode) >= 32 : TARGET_SIMD"
 {
+  if (!TARGET_CSSC && TARGET_SVE && <MODE>mode != QImode)
+    {
+      rtx tmp = gen_reg_rtx (<VEC_POP_MODE>mode);
+      rtx op1 = gen_lowpart (<VEC_POP_MODE>mode, operands[1]);
+      emit_insn (gen_popcount<vec_pop_mode>2 (tmp, op1));
+      emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp))

RE: [PATCH v2 2/2] aarch64: Improve part-variable vector initialization with SVE INDEX instruction [PR113328]

2024-09-18 Thread Pengxuan Zheng (QUIC)
> > Pengxuan Zheng  writes:
> > > We can still use SVE's INDEX instruction to construct vectors even
> > > if not all elements are constants. For example, { 0, x, 2, 3 } can
> > > be constructed by first using "INDEX #0, #1" to generate { 0, 1, 2,
> > > 3 }, and then set the elements which are non-constants separately.
> > >
> > >   PR target/113328
> > >
> > > gcc/ChangeLog:
> > >
> > >   * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback):
> > >   Improve part-variable vector generation with SVE's INDEX if
> > TARGET_SVE
> > >   is available.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >   * gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
> > >   check-function-bodies.
> > >   * gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
> > >   * gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
> > >   * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
> > >   * gcc.target/aarch64/sve/vec_init_4.c: New test.
> > >   * gcc.target/aarch64/sve/vec_init_5.c: New test.
> > >
> > > Signed-off-by: Pengxuan Zheng 
> > > ---
> > >  gcc/config/aarch64/aarch64.cc | 81 ++-
> > >  .../aarch64/sve/acle/general/dupq_1.c | 18 -
> > >  .../aarch64/sve/acle/general/dupq_2.c | 18 -
> > >  .../aarch64/sve/acle/general/dupq_3.c | 18 -
> > >  .../aarch64/sve/acle/general/dupq_4.c | 18 -
> > >  .../gcc.target/aarch64/sve/vec_init_4.c   | 47 +++
> > >  .../gcc.target/aarch64/sve/vec_init_5.c   | 12 +++
> > >  7 files changed, 199 insertions(+), 13 deletions(-)  create mode
> > > 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c
> > >  create mode 100644
> > > gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
> > >
> > > diff --git a/gcc/config/aarch64/aarch64.cc
> > > b/gcc/config/aarch64/aarch64.cc index 6b3ca57d0eb..7305a5c6375
> > > 100644
> > > --- a/gcc/config/aarch64/aarch64.cc
> > > +++ b/gcc/config/aarch64/aarch64.cc
> > > @@ -23942,12 +23942,91 @@ aarch64_expand_vector_init_fallback (rtx
> > target, rtx vals)
> > >if (n_var != n_elts)
> > >  {
> > >rtx copy = copy_rtx (vals);
> > > +  bool is_index_seq = false;
> > > +
> > > +  /* If at least half of the elements of the vector are constants 
> > > and all
> > > +  these constant elements form a linear sequence of the form { B, B + S,
> > > +  B + 2 * S, B + 3 * S, ... }, we can generate the vector with SVE's
> > > +  INDEX instruction if SVE is available and then set the elements which
> > > +  are not constant separately.  More precisely, each constant element I
> > > +  has to be B + I * S where B and S must be valid immediate operand
> > for
> > > +  an SVE INDEX instruction.
> > > +
> > > +  For example, { X, 1, 2, 3} is a vector satisfying these conditions and
> > > +  we can generate a vector of all constants (i.e., { 0, 1, 2, 3 }) first
> > > +  and then set the first element of the vector to X.  */
> > > +
> > > +  if (TARGET_SVE && GET_MODE_CLASS (mode) ==
> MODE_VECTOR_INT
> > > +   && n_var <= n_elts / 2)
> > > + {
> > > +   int const_idx = -1;
> > > +   HOST_WIDE_INT const_val = 0;
> > > +   int base = 16;
> > > +   int step = 16;
> > > +
> > > +   for (int i = 0; i < n_elts; ++i)
> > > + {
> > > +   rtx x = XVECEXP (vals, 0, i);
> > > +
> > > +   if (!CONST_INT_P (x))
> > > + continue;
> > > +
> > > +   if (const_idx == -1)
> > > + {
> > > +   const_idx = i;
> > > +   const_val = INTVAL (x);
> > > + }
> > > +   else
> > > + {
> > > +   if ((INTVAL (x) - const_val) % (i - const_idx) == 0)
> > > + {
> > > +   HOST_WIDE_INT s
> > > +   = (INTVAL (x) - const_val) / (i - const_idx);
> > > +   if (s >= -16 && s <= 15)
> > > + {
> > > +   int b = const_val - s * const_idx;
> > > +   if (b >= -16 && b <= 15)
> > > + {
> > > + 

RE: [PATCH 1/2] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-17 Thread Pengxuan Zheng (QUIC)
> > > On 16 Sep 2024, at 16:32, Richard Sandiford
>  wrote:
> > >
> > > External email: Use caution opening links or attachments
> > >
> > >
> > > "Pengxuan Zheng (QUIC)"  writes:
> > >>> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
> > >>>  wrote:
> > >>>>
> > >>>> SVE's INDEX instruction can be used to populate vectors by values
> > >>>> starting from "base" and incremented by "step" for each
> > >>>> subsequent value. We can take advantage of it to generate vector
> > >>>> constants if TARGET_SVE is available and the base and step values are
> within [-16, 15].
> > >>>
> > >>> Are there multiplication by or addition of scalar immediate
> > >>> instructions to enhance this with two-instruction sequences?
> > >>
> > >> No, Richard, I can't think of any equivalent two-instruction sequences.
> > >
> > > There are some.  E.g.:
> > >
> > > { 16, 17, 18, 19, ... }
> > >
> > > could be:
> > >
> > >index   z0.b, #0, #1
> > >add z0.b, z0.b, #16
> > >
> > > or, alternatively:
> > >
> > >mov w0, #16
> > >index   z0.b, w0, #1
> 
> I guess even step between [16, 31] could be handled with index with half step
> and then adding the result to itself (multiply by immediate #2), even if
> there's no direct vector-by-immediate instruction available.  Likewise of
> course some { A0 + n * B1 + n * B2, ... } can be handled by adding two index
> compute results.

Thanks for the example, Richard! It does seem to be something worth looking 
into.

Thanks,
Pengxuan
> 
> > > But these cases are less obviously a win, so I think it's ok to
> > > handle single instructions only for now.
> >
> > (Not related to this patch, this work is great, thanks Pengxuan!)
> > Looking at some SWOGs like for Neoverse V2 it looks like the first sequence
> is preferable.
> > On that core the INDEX-immediates-only operation has latency 4 and
> throughput 2 and the SVE ADD is as cheap as SIMD operations can be on that
> core.
> > But in the second sequence the INDEX-reg-operand has latency 7 and
> throughput 1 as it seems to treat it as a GP <-> SIMD transfer of some sort.
> 
> So what's the latency/throughput of a vector load from constant pool (can we
> even have a "SVE" constant pool?  I assume entries would have to be of the
> architecturally largest vector size?), assuming it's in L1 (where it would 
> occupy
> quite some space eventually).
> 
> Richard.
> 
> > We may encounter a situation in the future where we’ll want to optimize the
> second sequence (if it comes from intrinsics code for example) into the first.
> > Thanks,
> > Kyrill
> >
> >
> > >
> > > The patch is ok for trunk, thanks, but:
> > >
> > >>>> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> > >>> simd_immediate_info *info,
> > >>>>   if (CONST_VECTOR_P (op)
> > >>>>   && CONST_VECTOR_DUPLICATE_P (op))
> > >>>> n_elts = CONST_VECTOR_NPATTERNS (op);
> > >>>> -  else if ((vec_flags & VEC_SVE_DATA)
> > >>>> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> > >>>>   && const_vec_series_p (op, &base, &step))
> > >
> > > ...the convention is to have one && condition per line if the whole
> > > expression doesn't fit on a single line:
> > >
> > >  else if (which == AARCH64_CHECK_MOV
> > >   && TARGET_SVE
> > >   && const_vec_series_p (op, &base, &step))
> > >
> > > Richard
> >


RE: [PATCH 1/2] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-17 Thread Pengxuan Zheng (QUIC)
> > On 16 Sep 2024, at 16:32, Richard Sandiford 
> wrote:
> >
> > External email: Use caution opening links or attachments
> >
> >
> > "Pengxuan Zheng (QUIC)"  writes:
> >>> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
> >>>  wrote:
> >>>>
> >>>> SVE's INDEX instruction can be used to populate vectors by values
> >>>> starting from "base" and incremented by "step" for each subsequent
> >>>> value. We can take advantage of it to generate vector constants if
> >>>> TARGET_SVE is available and the base and step values are within [-16,
> 15].
> >>>
> >>> Are there multiplication by or addition of scalar immediate
> >>> instructions to enhance this with two-instruction sequences?
> >>
> >> No, Richard, I can't think of any equivalent two-instruction sequences.
> >
> > There are some.  E.g.:
> >
> > { 16, 17, 18, 19, ... }
> >
> > could be:
> >
> >index   z0.b, #0, #1
> >add z0.b, z0.b, #16
> >
> > or, alternatively:
> >
> >mov w0, #16
> >index   z0.b, w0, #1
> >
> > But these cases are less obviously a win, so I think it's ok to handle
> > single instructions only for now.
> 
> (Not related to this patch, this work is great, thanks Pengxuan!) Looking at
> some SWOGs like for Neoverse V2 it looks like the first sequence is 
> preferable.
> On that core the INDEX-immediates-only operation has latency 4 and
> throughput 2 and the SVE ADD is as cheap as SIMD operations can be on that
> core.
> But in the second sequence the INDEX-reg-operand has latency 7 and
> throughput 1 as it seems to treat it as a GP <-> SIMD transfer of some sort.
> We may encounter a situation in the future where we’ll want to optimize the
> second sequence (if it comes from intrinsics code for example) into the first.

This does look like something that we may want to consider improving in the 
future. Thanks for bringing it up and elaborating on it, Kyrylo!

Thanks,
Pengxuan

> Thanks,
> Kyrill
> 
> 
> >
> > The patch is ok for trunk, thanks, but:
> >
> >>>> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> >>> simd_immediate_info *info,
> >>>>   if (CONST_VECTOR_P (op)
> >>>>   && CONST_VECTOR_DUPLICATE_P (op))
> >>>> n_elts = CONST_VECTOR_NPATTERNS (op);
> >>>> -  else if ((vec_flags & VEC_SVE_DATA)
> >>>> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> >>>>   && const_vec_series_p (op, &base, &step))
> >
> > ...the convention is to have one && condition per line if the whole
> > expression doesn't fit on a single line:
> >
> >  else if (which == AARCH64_CHECK_MOV
> >   && TARGET_SVE
> >   && const_vec_series_p (op, &base, &step))
> >
> > Richard



RE: [PATCH v2 2/2] aarch64: Improve part-variable vector initialization with SVE INDEX instruction [PR113328]

2024-09-17 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > We can still use SVE's INDEX instruction to construct vectors even if
> > not all elements are constants. For example, { 0, x, 2, 3 } can be
> > constructed by first using "INDEX #0, #1" to generate { 0, 1, 2, 3 },
> > and then set the elements which are non-constants separately.
> >
> > PR target/113328
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback):
> > Improve part-variable vector generation with SVE's INDEX if
> TARGET_SVE
> > is available.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
> > check-function-bodies.
> > * gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
> > * gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
> > * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
> > * gcc.target/aarch64/sve/vec_init_4.c: New test.
> > * gcc.target/aarch64/sve/vec_init_5.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64.cc | 81 ++-
> >  .../aarch64/sve/acle/general/dupq_1.c | 18 -
> >  .../aarch64/sve/acle/general/dupq_2.c | 18 -
> >  .../aarch64/sve/acle/general/dupq_3.c | 18 -
> >  .../aarch64/sve/acle/general/dupq_4.c | 18 -
> >  .../gcc.target/aarch64/sve/vec_init_4.c   | 47 +++
> >  .../gcc.target/aarch64/sve/vec_init_5.c   | 12 +++
> >  7 files changed, 199 insertions(+), 13 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index 6b3ca57d0eb..7305a5c6375 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23942,12 +23942,91 @@ aarch64_expand_vector_init_fallback (rtx
> target, rtx vals)
> >if (n_var != n_elts)
> >  {
> >rtx copy = copy_rtx (vals);
> > +  bool is_index_seq = false;
> > +
> > +  /* If at least half of the elements of the vector are constants and 
> > all
> > +these constant elements form a linear sequence of the form { B, B + S,
> > +B + 2 * S, B + 3 * S, ... }, we can generate the vector with SVE's
> > +INDEX instruction if SVE is available and then set the elements which
> > +are not constant separately.  More precisely, each constant element I
> > +has to be B + I * S where B and S must be valid immediate operand
> for
> > +an SVE INDEX instruction.
> > +
> > +For example, { X, 1, 2, 3} is a vector satisfying these conditions and
> > +we can generate a vector of all constants (i.e., { 0, 1, 2, 3 }) first
> > +and then set the first element of the vector to X.  */
> > +
> > +  if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > + && n_var <= n_elts / 2)
> > +   {
> > + int const_idx = -1;
> > + HOST_WIDE_INT const_val = 0;
> > + int base = 16;
> > + int step = 16;
> > +
> > + for (int i = 0; i < n_elts; ++i)
> > +   {
> > + rtx x = XVECEXP (vals, 0, i);
> > +
> > + if (!CONST_INT_P (x))
> > +   continue;
> > +
> > + if (const_idx == -1)
> > +   {
> > + const_idx = i;
> > + const_val = INTVAL (x);
> > +   }
> > + else
> > +   {
> > + if ((INTVAL (x) - const_val) % (i - const_idx) == 0)
> > +   {
> > + HOST_WIDE_INT s
> > + = (INTVAL (x) - const_val) / (i - const_idx);
> > + if (s >= -16 && s <= 15)
> > +   {
> > + int b = const_val - s * const_idx;
> > + if (b >= -16 && b <= 15)
> > +   {
> > + base = b;
> > + step = s;
> > +   }
> > +   }
> > +   }
> > + break;
> > +   }
> > +   }
> > +
> > + if (base != 16
> > + && (!CONST_INT_P (v0)
> > + || (CONST_INT_P (v0) && INTVAL (v0) == base)))
> > +   {
> > +  

RE: [PATCH 1/2] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-16 Thread Pengxuan Zheng (QUIC)
> "Pengxuan Zheng (QUIC)"  writes:
> >> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
> >>  wrote:
> >> >
> >> > SVE's INDEX instruction can be used to populate vectors by values
> >> > starting from "base" and incremented by "step" for each subsequent
> >> > value. We can take advantage of it to generate vector constants if
> >> > TARGET_SVE is available and the base and step values are within [-16,
> 15].
> >>
> >> Are there multiplication by or addition of scalar immediate
> >> instructions to enhance this with two-instruction sequences?
> >
> > No, Richard, I can't think of any equivalent two-instruction sequences.
> 
> There are some.  E.g.:
> 
>  { 16, 17, 18, 19, ... }
> 
> could be:
> 
>   index   z0.b, #0, #1
>   add z0.b, z0.b, #16
> 
> or, alternatively:
> 
>   mov w0, #16
>   index   z0.b, w0, #1
> 
> But these cases are less obviously a win, so I think it's ok to handle single
> instructions only for now.
> 
> The patch is ok for trunk, thanks, but:
> 
> >> > @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> >> simd_immediate_info *info,
> >> >if (CONST_VECTOR_P (op)
> >> >&& CONST_VECTOR_DUPLICATE_P (op))
> >> >  n_elts = CONST_VECTOR_NPATTERNS (op);
> >> > -  else if ((vec_flags & VEC_SVE_DATA)
> >> > +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> >> >&& const_vec_series_p (op, &base, &step))
> 
> ...the convention is to have one && condition per line if the whole expression
> doesn't fit on a single line:
> 
>   else if (which == AARCH64_CHECK_MOV
>&& TARGET_SVE
>&& const_vec_series_p (op, &base, &step))
> 

Thanks for pointing this out, Richard! I've fixed this and pushed the patch as 
r15-3669-ga92f54f580c377.

Thanks,
Pengxuan
> Richard


RE: [PATCH] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-12 Thread Pengxuan Zheng (QUIC)
> > Pengxuan Zheng  writes:
> > > SVE's INDEX instruction can be used to populate vectors by values
> > > starting from "base" and incremented by "step" for each subsequent
> > > value. We can take advantage of it to generate vector constants if
> > > TARGET_SVE is available and the base and step values are within [-16, 15].
> > >
> > > For example, with the following function:
> > >
> > > typedef int v4si __attribute__ ((vector_size (16))); v4si f_v4si
> > > (void) {
> > >   return (v4si){ 0, 1, 2, 3 };
> > > }
> > >
> > > GCC currently generates:
> > >
> > > f_v4si:
> > >   adrp x0, .LC4
> > >   ldr q0, [x0, #:lo12:.LC4]
> > >   ret
> > >
> > > .LC4:
> > >   .word   0
> > >   .word   1
> > >   .word   2
> > >   .word   3
> > >
> > > With this patch, we generate an INDEX instruction instead if
> > > TARGET_SVE is available.
> > >
> > > f_v4si:
> > >   index   z0.s, #0, #1
> > >   ret
> > >
> > > [...]
> > > diff --git a/gcc/config/aarch64/aarch64.cc
> > > b/gcc/config/aarch64/aarch64.cc index 9e12bd9711c..01bfb8c52e4
> > > 100644
> > > --- a/gcc/config/aarch64/aarch64.cc
> > > +++ b/gcc/config/aarch64/aarch64.cc
> > > @@ -22960,8 +22960,7 @@ aarch64_simd_valid_immediate (rtx op,
> > simd_immediate_info *info,
> > >if (CONST_VECTOR_P (op)
> > >&& CONST_VECTOR_DUPLICATE_P (op))
> > >  n_elts = CONST_VECTOR_NPATTERNS (op);
> > > -  else if ((vec_flags & VEC_SVE_DATA)
> > > -&& const_vec_series_p (op, &base, &step))
> > > +  else if (TARGET_SVE && const_vec_series_p (op, &base, &step))
> >
> > I think we need to check which == AARCH64_CHECK_MOV too.  (Previously
> > that wasn't necessary, because native SVE only uses this routine for
> > moves.)
> >
> > FTR: I was initially a bit nervous about testing TARGET_SVE without
> > looking at vec_flags at all.  But looking at the previous handling of
> > predicates and structures, I agree it looks like the correct thing to do.
> >
> > >  {
> > >gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
> > >if (!aarch64_sve_index_immediate_p (base) [...] diff --git
> > > a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > > index 216699b0536..3d6a0160f95 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > > @@ -10,7 +10,6 @@ dupq (int x)
> > >return svdupq_s32 (x, 1, 2, 3);
> > >  }
> > >
> > > -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> > > +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #1, #2} } } */
> > >  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } }
> > > */
> > >  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q,
> > > z[0-9]+\.q\[0\]\n} } } */
> > > -/* { dg-final { scan-assembler
> > > {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
> >
> > This seems to be a regression of sorts.  Previously we had:
> >
> > adrp x1, .LC0
> > ldr q0, [x1, #:lo12:.LC0]
> > ins v0.s[0], w0
> > dup z0.q, z0.q[0]
> >
> > whereas now we have:
> >
> > movi v0.2s, 0x2
> > index   z31.s, #1, #2
> > ins v0.s[0], w0
> > zip1 v0.4s, v0.4s, v31.4s
> > dup z0.q, z0.q[0]
> >
> > I think we should try to aim for:
> >
> > index   z0.s, #0, #1
> > ins v0.s[0], w0
> > dup z0.q, z0.q[0]
> >
> > instead.
> 
> Thanks for the feedback, Richard!
> 
> I've added support to handle vectors with non-constant elements. I've split
> that change into a separate patch. Please let me know if you have any
> comments.
> 
> [PATCH 1/2] aarch64: Improve vector constant generation using SVE INDEX
> instruction [PR113328] https://gcc.gnu.org/pipermail/gcc-patches/2024-
> September/662842.html
> 
> [PATCH 2/2] aarch64: Improve part-variable vector initialization with SVE
> INDEX instruction [PR113328] https://gcc.gnu.org/pipermail/gcc-
> patches/2024-September/662843.html

Just updated [PATCH 2/2] to fix some issues in the test cases. Here's the latest
patch:
[PATCH v2 2/2] aarch64: Improve part-variable vector initialization with SVE 
INDEX instruction [PR113328]
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662925.html

Thanks,
Pengxuan
> 
> Thanks,
> Pengxuan
> >
> > > [...]
> > > +/*
> > > +** g_v4si:
> > > +**   index   z0\.s, #3, #\-4
> >
> > The backslash looks redundant here.
> >
> > Thanks,
> > Richard
> >
> > > +**   ret
> > > +*/
> > > +v4si
> > > +g_v4si (void)
> > > +{
> > > +  return (v4si){ 3, -1, -5, -9 };
> > > +}


RE: [PATCH 1/2] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-12 Thread Pengxuan Zheng (QUIC)
> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
>  wrote:
> >
> > SVE's INDEX instruction can be used to populate vectors by values
> > starting from "base" and incremented by "step" for each subsequent
> > value. We can take advantage of it to generate vector constants if
> > TARGET_SVE is available and the base and step values are within [-16, 15].
> 
> Are there multiplication by or addition of scalar immediate instructions to
> enhance this with two-instruction sequences?

No, Richard, I can't think of any equivalent two-instruction sequences.

Thanks,
Pengxuan
> 
> > For example, with the following function:
> >
> > typedef int v4si __attribute__ ((vector_size (16))); v4si f_v4si
> > (void) {
> >   return (v4si){ 0, 1, 2, 3 };
> > }
> >
> > GCC currently generates:
> >
> > f_v4si:
> > adrp x0, .LC4
> > ldr q0, [x0, #:lo12:.LC4]
> > ret
> >
> > .LC4:
> > .word   0
> > .word   1
> > .word   2
> > .word   3
> >
> > With this patch, we generate an INDEX instruction instead if
> > TARGET_SVE is available.
> >
> > f_v4si:
> > index   z0.s, #0, #1
> > ret
> >
> > PR target/113328
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64.cc (aarch64_simd_valid_immediate):
> Improve
> > handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE
> is
> > available.
> > (aarch64_output_simd_mov_immediate): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
> >     SVE's INDEX instruction.
> > * gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
> > * gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
> > * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
> > * gcc.target/aarch64/sve/vec_init_3.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64.cc | 12 ++-
> >  .../aarch64/sve/acle/general/dupq_1.c |  3 +-
> >  .../aarch64/sve/acle/general/dupq_2.c |  3 +-
> >  .../aarch64/sve/acle/general/dupq_3.c |  3 +-
> >  .../aarch64/sve/acle/general/dupq_4.c |  3 +-
> >  .../gcc.target/aarch64/sve/vec_init_3.c   | 99 +++
> >  6 files changed, 114 insertions(+), 9 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index 27e24ba70ab..6b3ca57d0eb 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> simd_immediate_info *info,
> >if (CONST_VECTOR_P (op)
> >&& CONST_VECTOR_DUPLICATE_P (op))
> >  n_elts = CONST_VECTOR_NPATTERNS (op);
> > -  else if ((vec_flags & VEC_SVE_DATA)
> > +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> >&& const_vec_series_p (op, &base, &step))
> >  {
> >gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); @@
> > -25249,6 +25249,16 @@ aarch64_output_simd_mov_immediate (rtx
> > const_vector, unsigned width,
> >
> >if (which == AARCH64_CHECK_MOV)
> >  {
> > +  if (info.insn == simd_immediate_info::INDEX)
> > +   {
> > + gcc_assert (TARGET_SVE);
> > + snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
> > +   HOST_WIDE_INT_PRINT_DEC ", #"
> HOST_WIDE_INT_PRINT_DEC,
> > +   element_char, INTVAL (info.u.index.base),
> > +   INTVAL (info.u.index.step));
> > + return templ;
> > +   }
> > +
> >mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
> >shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
> >   ? "msl" : "lsl");
> > diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > index 216699b0536..0940bedd0dd 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > @@ -10,7 +10,6 @@ dupq (int x)
> >   

[PATCH v2 2/2] aarch64: Improve part-variable vector initialization with SVE INDEX instruction [PR113328]

2024-09-12 Thread Pengxuan Zheng
We can still use SVE's INDEX instruction to construct vectors even if not all
elements are constants. For example, { 0, x, 2, 3 } can be constructed by first
using "INDEX #0, #1" to generate { 0, 1, 2, 3 }, and then set the elements which
are non-constants separately.
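
As a C-level illustration (a hedged sketch; the function name is made up and the
exact sequences are what the dupq_*.c and vec_init_*.c tests below check), an
initializer that is constant except for one lane is the case this targets:

typedef int v4si __attribute__ ((vector_size (16)));

v4si
mostly_const (int x)
{
  /* With SVE enabled, the expected expansion is roughly
       index   z0.s, #0, #1    // { 0, 1, 2, 3 }
       ins     v0.s[0], w0     // patch in the variable lane
     rather than a literal-pool load.  */
  return (v4si){ x, 1, 2, 3 };
}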

PR target/113328

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback):
Improve part-variable vector generation with SVE's INDEX if TARGET_SVE
is available.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
check-function-bodies.
* gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
* gcc.target/aarch64/sve/vec_init_4.c: New test.
* gcc.target/aarch64/sve/vec_init_5.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64.cc | 81 ++-
 .../aarch64/sve/acle/general/dupq_1.c | 18 -
 .../aarch64/sve/acle/general/dupq_2.c | 18 -
 .../aarch64/sve/acle/general/dupq_3.c | 18 -
 .../aarch64/sve/acle/general/dupq_4.c | 18 -
 .../gcc.target/aarch64/sve/vec_init_4.c   | 47 +++
 .../gcc.target/aarch64/sve/vec_init_5.c   | 12 +++
 7 files changed, 199 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6b3ca57d0eb..7305a5c6375 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23942,12 +23942,91 @@ aarch64_expand_vector_init_fallback (rtx target, rtx 
vals)
   if (n_var != n_elts)
 {
   rtx copy = copy_rtx (vals);
+  bool is_index_seq = false;
+
+  /* If at least half of the elements of the vector are constants and all
+these constant elements form a linear sequence of the form { B, B + S,
+B + 2 * S, B + 3 * S, ... }, we can generate the vector with SVE's
+INDEX instruction if SVE is available and then set the elements which
+are not constant separately.  More precisely, each constant element I
+has to be B + I * S where B and S must be valid immediate operand for
+an SVE INDEX instruction.
+
+For example, { X, 1, 2, 3} is a vector satisfying these conditions and
+we can generate a vector of all constants (i.e., { 0, 1, 2, 3 }) first
+and then set the first element of the vector to X.  */
+
+  if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && n_var <= n_elts / 2)
+   {
+ int const_idx = -1;
+ HOST_WIDE_INT const_val = 0;
+ int base = 16;
+ int step = 16;
+
+ for (int i = 0; i < n_elts; ++i)
+   {
+ rtx x = XVECEXP (vals, 0, i);
+
+ if (!CONST_INT_P (x))
+   continue;
+
+ if (const_idx == -1)
+   {
+ const_idx = i;
+ const_val = INTVAL (x);
+   }
+ else
+   {
+ if ((INTVAL (x) - const_val) % (i - const_idx) == 0)
+   {
+ HOST_WIDE_INT s
+ = (INTVAL (x) - const_val) / (i - const_idx);
+ if (s >= -16 && s <= 15)
+   {
+ int b = const_val - s * const_idx;
+ if (b >= -16 && b <= 15)
+   {
+ base = b;
+ step = s;
+   }
+   }
+   }
+ break;
+   }
+   }
+
+ if (base != 16
+ && (!CONST_INT_P (v0)
+ || (CONST_INT_P (v0) && INTVAL (v0) == base)))
+   {
+ if (!CONST_INT_P (v0))
+   XVECEXP (copy, 0, 0) = GEN_INT (base);
+
+ is_index_seq = true;
+ for (int i = 1; i < n_elts; ++i)
+   {
+ rtx x = XVECEXP (copy, 0, i);
+
+ if (CONST_INT_P (x))
+   {
+ if (INTVAL (x) != base + i * step)
+   {
+ is_index_seq = false;
+ break;
+   }
+   }
+ else
+   XVECEXP (copy, 0, i) = GEN_INT (base + i * step);
+   }
+   }
+   }
 
   /* Load constant part of vector.  We really don't care what goes into the
 parts we will overwrite, but we're more likely to be able to load the
 constant efficien
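
As a side note for readers of the thread, here is a standalone C sketch
(hypothetical helper, not part of the patch) of the check described in the
comment above: every constant lane must equal B + I * S for a base B and step S
that are both valid INDEX immediates, i.e. in [-16, 15], and variable lanes are
ignored.

#include <stdbool.h>

static bool
fits_index_sequence (const long long *val, const bool *is_const, int n,
                     long long *base_out, long long *step_out)
{
  /* Derive a candidate base and step from the first two constant lanes.  */
  int first = -1, second = -1;
  for (int i = 0; i < n; i++)
    if (is_const[i])
      {
        if (first == -1)
          first = i;
        else
          {
            second = i;
            break;
          }
      }
  if (second == -1)
    return false;  /* Need at least two constant lanes to infer a step.  */

  if ((val[second] - val[first]) % (second - first) != 0)
    return false;
  long long step = (val[second] - val[first]) / (second - first);
  long long base = val[first] - step * first;
  if (base < -16 || base > 15 || step < -16 || step > 15)
    return false;  /* INDEX immediates must be in [-16, 15].  */

  /* Every constant lane must agree with B + I * S.  */
  for (int i = 0; i < n; i++)
    if (is_const[i] && val[i] != base + (long long) i * step)
      return false;

  *base_out = base;
  *step_out = step;
  return true;
}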

RE: [PATCH] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-11 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > SVE's INDEX instruction can be used to populate vectors by values
> > starting from "base" and incremented by "step" for each subsequent
> > value. We can take advantage of it to generate vector constants if
> > TARGET_SVE is available and the base and step values are within [-16, 15].
> >
> > For example, with the following function:
> >
> > typedef int v4si __attribute__ ((vector_size (16))); v4si f_v4si
> > (void) {
> >   return (v4si){ 0, 1, 2, 3 };
> > }
> >
> > GCC currently generates:
> >
> > f_v4si:
> > adrp x0, .LC4
> > ldr q0, [x0, #:lo12:.LC4]
> > ret
> >
> > .LC4:
> > .word   0
> > .word   1
> > .word   2
> > .word   3
> >
> > With this patch, we generate an INDEX instruction instead if
> > TARGET_SVE is available.
> >
> > f_v4si:
> > index   z0.s, #0, #1
> > ret
> >
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index 9e12bd9711c..01bfb8c52e4 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22960,8 +22960,7 @@ aarch64_simd_valid_immediate (rtx op,
> simd_immediate_info *info,
> >if (CONST_VECTOR_P (op)
> >&& CONST_VECTOR_DUPLICATE_P (op))
> >  n_elts = CONST_VECTOR_NPATTERNS (op);
> > -  else if ((vec_flags & VEC_SVE_DATA)
> > -  && const_vec_series_p (op, &base, &step))
> > +  else if (TARGET_SVE && const_vec_series_p (op, &base, &step))
> 
> I think we need to check which == AARCH64_CHECK_MOV too.  (Previously
> that wasn't necessary, because native SVE only uses this routine for moves.)
> 
> FTR: I was initially a bit nervous about testing TARGET_SVE without looking at
> vec_flags at all.  But looking at the previous handling of predicates and
> structures, I agree it looks like the correct thing to do.
> 
> >  {
> >gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
> >if (!aarch64_sve_index_immediate_p (base) [...] diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > index 216699b0536..3d6a0160f95 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > @@ -10,7 +10,6 @@ dupq (int x)
> >return svdupq_s32 (x, 1, 2, 3);
> >  }
> >
> > -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> > +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #1, #2} } } */
> >  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
> >  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n}
> > } } */
> > -/* { dg-final { scan-assembler
> > {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
> 
> This seems to be a regression of sorts.  Previously we had:
> 
> adrp x1, .LC0
> ldr q0, [x1, #:lo12:.LC0]
> ins v0.s[0], w0
> dup z0.q, z0.q[0]
> 
> whereas now we have:
> 
> movi v0.2s, 0x2
> index   z31.s, #1, #2
> ins v0.s[0], w0
> zip1 v0.4s, v0.4s, v31.4s
> dup z0.q, z0.q[0]
> 
> I think we should try to aim for:
> 
> index   z0.s, #0, #1
> ins v0.s[0], w0
> dup z0.q, z0.q[0]
> 
> instead.

Thanks for the feedback, Richard!

I've added support to handle vectors with non-constant elements. I've split 
that change into a separate patch. Please let me know if you have any comments.

[PATCH 1/2] aarch64: Improve vector constant generation using SVE INDEX 
instruction [PR113328]
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662842.html

[PATCH 2/2] aarch64: Improve part-variable vector initialization with SVE INDEX 
instruction [PR113328]
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/662843.html

Thanks,
Pengxuan
> 
> > [...]
> > +/*
> > +** g_v4si:
> > +** index   z0\.s, #3, #\-4
> 
> The backslash looks redundant here.
> 
> Thanks,
> Richard
> 
> > +** ret
> > +*/
> > +v4si
> > +g_v4si (void)
> > +{
> > +  return (v4si){ 3, -1, -5, -9 };
> > +}


[PATCH 2/2] aarch64: Improve part-variable vector initialization with SVE INDEX instruction [PR113328]

2024-09-11 Thread Pengxuan Zheng
We can still use SVE's INDEX instruction to construct vectors even if not all
elements are constants. For example, { 0, x, 2, 3 } can be constructed by first
using "INDEX #0, #1" to generate { 0, 1, 2, 3 }, and then set the elements which
are non-constants separately.

PR target/113328

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback):
Improve part-variable vector generation with SVE's INDEX if TARGET_SVE
is available.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
check-function-bodies.
* gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
* gcc.target/aarch64/sve/vec_init_4.c: New test.
* gcc.target/aarch64/sve/vec_init_5.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64.cc | 81 ++-
 .../aarch64/sve/acle/general/dupq_1.c | 12 ++-
 .../aarch64/sve/acle/general/dupq_2.c | 12 ++-
 .../aarch64/sve/acle/general/dupq_3.c | 12 ++-
 .../aarch64/sve/acle/general/dupq_4.c | 12 ++-
 .../gcc.target/aarch64/sve/vec_init_4.c   | 47 +++
 .../gcc.target/aarch64/sve/vec_init_5.c   | 12 +++
 7 files changed, 171 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6b3ca57d0eb..7305a5c6375 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23942,12 +23942,91 @@ aarch64_expand_vector_init_fallback (rtx target, rtx 
vals)
   if (n_var != n_elts)
 {
   rtx copy = copy_rtx (vals);
+  bool is_index_seq = false;
+
+  /* If at least half of the elements of the vector are constants and all
+these constant elements form a linear sequence of the form { B, B + S,
+B + 2 * S, B + 3 * S, ... }, we can generate the vector with SVE's
+INDEX instruction if SVE is available and then set the elements which
+are not constant separately.  More precisely, each constant element I
+has to be B + I * S where B and S must be valid immediate operand for
+an SVE INDEX instruction.
+
+For example, { X, 1, 2, 3} is a vector satisfying these conditions and
+we can generate a vector of all constants (i.e., { 0, 1, 2, 3 }) first
+and then set the first element of the vector to X.  */
+
+  if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && n_var <= n_elts / 2)
+   {
+ int const_idx = -1;
+ HOST_WIDE_INT const_val = 0;
+ int base = 16;
+ int step = 16;
+
+ for (int i = 0; i < n_elts; ++i)
+   {
+ rtx x = XVECEXP (vals, 0, i);
+
+ if (!CONST_INT_P (x))
+   continue;
+
+ if (const_idx == -1)
+   {
+ const_idx = i;
+ const_val = INTVAL (x);
+   }
+ else
+   {
+ if ((INTVAL (x) - const_val) % (i - const_idx) == 0)
+   {
+ HOST_WIDE_INT s
+ = (INTVAL (x) - const_val) / (i - const_idx);
+ if (s >= -16 && s <= 15)
+   {
+ int b = const_val - s * const_idx;
+ if (b >= -16 && b <= 15)
+   {
+ base = b;
+ step = s;
+   }
+   }
+   }
+ break;
+   }
+   }
+
+ if (base != 16
+ && (!CONST_INT_P (v0)
+ || (CONST_INT_P (v0) && INTVAL (v0) == base)))
+   {
+ if (!CONST_INT_P (v0))
+   XVECEXP (copy, 0, 0) = GEN_INT (base);
+
+ is_index_seq = true;
+ for (int i = 1; i < n_elts; ++i)
+   {
+ rtx x = XVECEXP (copy, 0, i);
+
+ if (CONST_INT_P (x))
+   {
+ if (INTVAL (x) != base + i * step)
+   {
+ is_index_seq = false;
+ break;
+   }
+   }
+ else
+   XVECEXP (copy, 0, i) = GEN_INT (base + i * step);
+   }
+   }
+   }
 
   /* Load constant part of vector.  We really don't care what goes into the
 parts we will overwrite, but we're more likely to be able to load the
 constant efficiently if it h

[PATCH 1/2] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-09-11 Thread Pengxuan Zheng
SVE's INDEX instruction can be used to populate vectors by values starting from
"base" and incremented by "step" for each subsequent value. We can take
advantage of it to generate vector constants if TARGET_SVE is available and the
base and step values are within [-16, 15].

For example, with the following function:

typedef int v4si __attribute__ ((vector_size (16)));
v4si
f_v4si (void)
{
  return (v4si){ 0, 1, 2, 3 };
}

GCC currently generates:

f_v4si:
adrp x0, .LC4
ldr q0, [x0, #:lo12:.LC4]
ret

.LC4:
.word   0
.word   1
.word   2
.word   3

With this patch, we generate an INDEX instruction instead if TARGET_SVE is
available.

f_v4si:
index   z0.s, #0, #1
ret

PR target/113328

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve
handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE is
available.
(aarch64_output_simd_mov_immediate): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
SVE's INDEX instruction.
* gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
* gcc.target/aarch64/sve/vec_init_3.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64.cc | 12 ++-
 .../aarch64/sve/acle/general/dupq_1.c |  3 +-
 .../aarch64/sve/acle/general/dupq_2.c |  3 +-
 .../aarch64/sve/acle/general/dupq_3.c |  3 +-
 .../aarch64/sve/acle/general/dupq_4.c |  3 +-
 .../gcc.target/aarch64/sve/vec_init_3.c   | 99 +++
 6 files changed, 114 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 27e24ba70ab..6b3ca57d0eb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op, 
simd_immediate_info *info,
   if (CONST_VECTOR_P (op)
   && CONST_VECTOR_DUPLICATE_P (op))
 n_elts = CONST_VECTOR_NPATTERNS (op);
-  else if ((vec_flags & VEC_SVE_DATA)
+  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
   && const_vec_series_p (op, &base, &step))
 {
   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
@@ -25249,6 +25249,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, 
unsigned width,
 
   if (which == AARCH64_CHECK_MOV)
 {
+  if (info.insn == simd_immediate_info::INDEX)
+   {
+ gcc_assert (TARGET_SVE);
+ snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
+   HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
+   element_char, INTVAL (info.u.index.base),
+   INTVAL (info.u.index.step));
+ return templ;
+   }
+
   mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
   shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
  ? "msl" : "lsl");
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
index 216699b0536..0940bedd0dd 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
index d494943a275..218a6601337 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t2\n\t\.word\t1\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
index 4bc8259df07..245d43b75b5 100644
--- a/gcc/tes

[PATCH] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

2024-09-03 Thread Pengxuan Zheng
This is similar to the recent improvements to the Advanced SIMD popcount
expansion by using SVE. We can utilize SVE to generate more efficient code for
scalar mode popcount too.

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (popcount<mode>2): Update pattern to
also support V1DI mode.
* config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE support.
* config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
(SVE_VDQ_I): Add V1DI.
(bitsize): Likewise.
(VPRED): Likewise.
(VEC_POP_MODE): New mode attribute.
(vec_pop_mode): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt11.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md  |  5 +-
 gcc/config/aarch64/aarch64.md   |  9 
 gcc/config/aarch64/iterators.md | 16 --
 gcc/testsuite/gcc.target/aarch64/popcnt11.c | 58 +
 4 files changed, 83 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt11.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 23c03a96371..649aeaf19ed 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3515,8 +3515,9 @@ (define_insn "popcount<mode>2"
 )
 
 (define_expand "popcount<mode>2"
-  [(set (match_operand:VDQHSD 0 "register_operand")
-   (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  [(set (match_operand:VDQHSD_V1DI 0 "register_operand")
+   (popcount:VDQHSD_V1DI
+ (match_operand:VDQHSD_V1DI 1 "register_operand")))]
   "TARGET_SIMD"
   {
 if (TARGET_SVE)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c54b29cd64b..ef52770f1cb 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5345,6 +5345,15 @@ (define_expand "popcount<mode>2"
(popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
   "TARGET_CSSC ? GET_MODE_BITSIZE (mode) >= 32 : TARGET_SIMD"
 {
+  if (!TARGET_CSSC && TARGET_SVE && <MODE>mode != QImode)
+    {
+      rtx tmp = gen_reg_rtx (<VEC_POP_MODE>mode);
+      rtx op1 = gen_lowpart (<VEC_POP_MODE>mode, operands[1]);
+      emit_insn (gen_popcount<vec_pop_mode>2 (tmp, op1));
+      emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
+      DONE;
+    }
+
   if (!TARGET_CSSC)
 {
   rtx v = gen_reg_rtx (V8QImode);
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 20a318e023b..84387a8119e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -290,6 +290,8 @@ (define_mode_iterator VDQHS [V4HI V8HI V2SI V4SI])
 ;; Advanced SIMD modes for H, S and D types.
 (define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI])
 
+(define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI])
+
 ;; Advanced SIMD and scalar integer modes for H and S.
 (define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI])
 
@@ -560,7 +562,7 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
 
 ;; All SVE and Advanced SIMD integer vector modes.
-(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I V1DI])
 
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
@@ -1230,7 +1232,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
 (define_mode_attr bitsize [(V8QI "64") (V16QI "128")
   (V4HI "64") (V8HI "128")
   (V2SI "64") (V4SI "128")
-  (V2DI "128")])
+  (V1DI "64") (V2DI "128")])
 
 ;; Map a floating point or integer mode to the appropriate register name prefix
 (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
@@ -2284,7 +2286,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI 
"VNx8BI")
 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
 (V8QI "VNx8BI") (V16QI "VNx16BI")
 (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
-(V4SI "VNx4BI") (V2DI "VNx2BI")])
+(V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
@@ -2318,6 +2320,14 @@ (define_mode_attr VDOUBLE [(VNx16QI "VNx32QI")
   (VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
   

[PATCH] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

2024-08-13 Thread Pengxuan Zheng
SVE's INDEX instruction can be used to populate vectors by values starting from
"base" and incremented by "step" for each subsequent value. We can take
advantage of it to generate vector constants if TARGET_SVE is available and the
base and step values are within [-16, 15].

For example, with the following function:

typedef int v4si __attribute__ ((vector_size (16)));
v4si
f_v4si (void)
{
  return (v4si){ 0, 1, 2, 3 };
}

GCC currently generates:

f_v4si:
adrp x0, .LC4
ldr q0, [x0, #:lo12:.LC4]
ret

.LC4:
.word   0
.word   1
.word   2
.word   3

With this patch, we generate an INDEX instruction instead if TARGET_SVE is
available.

f_v4si:
index   z0.s, #0, #1
ret

PR target/113328

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve
handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE is
available.
(aarch64_output_simd_mov_immediate): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
SVE's INDEX instruction.
* gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
* gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
* gcc.target/aarch64/sve/vec_init_3.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64.cc | 13 ++-
 .../aarch64/sve/acle/general/dupq_1.c |  3 +-
 .../aarch64/sve/acle/general/dupq_2.c |  3 +-
 .../aarch64/sve/acle/general/dupq_3.c |  5 +-
 .../aarch64/sve/acle/general/dupq_4.c |  5 +-
 .../gcc.target/aarch64/sve/vec_init_3.c   | 99 +++
 6 files changed, 116 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e12bd9711c..01bfb8c52e4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22960,8 +22960,7 @@ aarch64_simd_valid_immediate (rtx op, 
simd_immediate_info *info,
   if (CONST_VECTOR_P (op)
   && CONST_VECTOR_DUPLICATE_P (op))
 n_elts = CONST_VECTOR_NPATTERNS (op);
-  else if ((vec_flags & VEC_SVE_DATA)
-  && const_vec_series_p (op, &base, &step))
+  else if (TARGET_SVE && const_vec_series_p (op, &base, &step))
 {
   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
   if (!aarch64_sve_index_immediate_p (base)
@@ -25218,6 +25217,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, 
unsigned width,
 
   if (which == AARCH64_CHECK_MOV)
 {
+  if (info.insn == simd_immediate_info::INDEX)
+   {
+ gcc_assert (TARGET_SVE);
+ snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
+   HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
+   element_char, INTVAL (info.u.index.base),
+   INTVAL (info.u.index.step));
+ return templ;
+   }
+
   mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
   shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
  ? "msl" : "lsl");
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
index 216699b0536..3d6a0160f95 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #1, #2} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
index d494943a275..c5e793a8645 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #\-2} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t2\n\t\.word\t1\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/

RE: [PATCH v2] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

2024-08-01 Thread Pengxuan Zheng (QUIC)
Pushed as r15-2659-ge4b8db26de352.

Pengxuan
> This patch improves the Advanced SIMD popcount expansion by using SVE if
> available.
> 
> For example, GCC currently generates the following code sequence for V2DI:
>   cnt v31.16b, v31.16b
>   uaddlp  v31.8h, v31.16b
>   uaddlp  v31.4s, v31.8h
>   uaddlp  v31.2d, v31.4s
> 
> However, by using SVE, we can generate the following sequence instead:
>   ptrue   p7.b, all
>   cnt z31.d, p7/m, z31.d
> 
> Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.
> 
> The scalar popcount expansion can also be improved similarly by using SVE
> and those changes will be included in a separate patch.
> 
>   PR target/113860
> 
> gcc/ChangeLog:
> 
>   * config/aarch64/aarch64-simd.md (popcount2): Add
> TARGET_SVE
>   support.
>   * config/aarch64/aarch64-sve.md
> (@aarch64_pred_): Use new
>   iterator SVE_VDQ_I.
>   * config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
>   (VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/aarch64/popcnt-sve.c: New test.
> 
> Signed-off-by: Pengxuan Zheng 
> ---
>  gcc/config/aarch64/aarch64-simd.md|  9 ++
>  gcc/config/aarch64/aarch64-sve.md | 13 +--
>  gcc/config/aarch64/iterators.md   |  5 ++
>  gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++
>  4 files changed, 109 insertions(+), 6 deletions(-)  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index bbeee221f37..895d6e5eab5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3508,6 +3508,15 @@ (define_expand "popcount2"
>   (popcount:VDQHSD (match_operand:VDQHSD 1
> "register_operand")))]
>"TARGET_SIMD"
>{
> +if (TARGET_SVE)
> +  {
> + rtx p = aarch64_ptrue_reg (mode);
> + emit_insn (gen_aarch64_pred_popcount (operands[0],
> + p,
> + operands[1]));
> + DONE;
> +  }
> +
>  /* Generate a byte popcount.  */
>  machine_mode mode =  == 64 ? V8QImode : V16QImode;
>  rtx tmp = gen_reg_rtx (mode);
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-
> sve.md
> index 5331e7121d5..eb3705ae515 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -3104,16 +3104,16 @@ (define_expand "2"
> 
>  ;; Integer unary arithmetic predicated with a PTRUE.
>  (define_insn "@aarch64_pred_"
> -  [(set (match_operand:SVE_I 0 "register_operand")
> - (unspec:SVE_I
> +  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
> + (unspec:SVE_VDQ_I
> [(match_operand: 1 "register_operand")
> -(SVE_INT_UNARY:SVE_I
> -  (match_operand:SVE_I 2 "register_operand"))]
> +(SVE_INT_UNARY:SVE_VDQ_I
> +  (match_operand:SVE_VDQ_I 2 "register_operand"))]
> UNSPEC_PRED_X))]
>"TARGET_SVE"
>{@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> - [ w, Upl , 0 ; *  ] \t%0., %1/m,
> %2.
> - [ ?&w  , Upl , w ; yes] movprfx\t%0,
> %2\;\t%0., %1/m, %2.
> + [ w, Upl , 0 ; *  ] \t%Z0., 
> %1/m,
> %Z2.
> + [ ?&w  , Upl , w ; yes] movprfx\t%Z0,
> %Z2\;\t%Z0., %1/m, %Z2.
>}
>  )
> 
> @@ -3168,6 +3168,7 @@ (define_insn "*cond__any"
>}
>  )
> 
> +
>  ;; -
>  ;;  [INT] General unary arithmetic corresponding to unspecs  ;; 
> ---
> --
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index f527b2cfeb8..ee3d1fb98fd 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -559,6 +559,9 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI
> VNx4QI VNx2QI  ;; element modes  (define_mode_iterator SVE_I_SIMD_DI
> [SVE_I V2DI])
> 
> +;; All SVE and Advanced SIMD integer vector modes.
> +(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
> +
>  ;; SVE integer vector modes whose elements are 16 bits or wider.
>  (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
>   VNx4SI VNx2SI
> @@ -2278,6 +2281,8 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI")
> (

RE: [PATCH] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

2024-07-31 Thread Pengxuan Zheng (QUIC)
> Sorry for the slow review.
> 
> Pengxuan Zheng  writes:
> > This patch improves the Advanced SIMD popcount expansion by using SVE
> > if available.
> >
> > For example, GCC currently generates the following code sequence for V2DI:
> >   cnt v31.16b, v31.16b
> >   uaddlp  v31.8h, v31.16b
> >   uaddlp  v31.4s, v31.8h
> >   uaddlp  v31.2d, v31.4s
> >
> > However, by using SVE, we can generate the following sequence instead:
> >   ptrue   p7.b, all
> >   cnt z31.d, p7/m, z31.d
> >
> > Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.
> >
> > The scalar popcount expansion can also be improved similarly by using
> > SVE and those changes will be included in a separate patch.
> >
> > PR target/113860
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (popcount2): Add
> TARGET_SVE
> > support.
> > * config/aarch64/aarch64-sve.md
> (@aarch64_pred_popcount): New
> > insn.
> > * config/aarch64/iterators.md (VPRED): Add V4HI, V8HI and V2SI.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt-sve.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md|  9 ++
> >  gcc/config/aarch64/aarch64-sve.md | 12 +++
> >  gcc/config/aarch64/iterators.md   |  1 +
> >  gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88
> > +++
> >  4 files changed, 110 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index bbeee221f37..895d6e5eab5 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3508,6 +3508,15 @@ (define_expand "popcount2"
> > (popcount:VDQHSD (match_operand:VDQHSD 1
> "register_operand")))]
> >"TARGET_SIMD"
> >{
> > +if (TARGET_SVE)
> > +  {
> > +   rtx p = aarch64_ptrue_reg (mode);
> > +   emit_insn (gen_aarch64_pred_popcount (operands[0],
> > +   p,
> > +   operands[1]));
> > +   DONE;
> > +  }
> > +
> >  /* Generate a byte popcount.  */
> >  machine_mode mode =  == 64 ? V8QImode : V16QImode;
> >  rtx tmp = gen_reg_rtx (mode);
> > diff --git a/gcc/config/aarch64/aarch64-sve.md
> > b/gcc/config/aarch64/aarch64-sve.md
> > index 5331e7121d5..b5021dd2da0 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -3168,6 +3168,18 @@ (define_insn "*cond__any"
> >}
> >  )
> >
> > +;; Popcount predicated with a PTRUE.
> > +(define_insn "@aarch64_pred_popcount"
> > +  [(set (match_operand:VDQHSD 0 "register_operand" "=w")
> > +   (unspec:VDQHSD
> > + [(match_operand: 1 "register_operand" "Upl")
> > +  (popcount:VDQHSD
> > +(match_operand:VDQHSD 2 "register_operand" "0"))]
> > + UNSPEC_PRED_X))]
> > +  "TARGET_SVE"
> > +  "cnt\t%Z0., %1/m, %Z2."
> > +)
> > +
> 
> Could you instead change:
> 
> (define_insn "@aarch64_pred_"
>   [(set (match_operand:SVE_I 0 "register_operand")
>   (unspec:SVE_I
> [(match_operand: 1 "register_operand")
>  (SVE_INT_UNARY:SVE_I
>(match_operand:SVE_I 2 "register_operand"))]
> UNSPEC_PRED_X))]
>   "TARGET_SVE"
>   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
>  [ w, Upl , 0 ; *  ] \t%0., %1/m,
> %2.
>  [ ?&w  , Upl , w ; yes] movprfx\t%0,
> %2\;\t%0., %1/m, %2.
>   }
> )
> 
> to use a new iterator SVE_VDQ_I, defined as:
> 
> (define_mode_iterator SVE_VDQ_I [SVE_I VDQI_I])
> 
> ?  That will give the benefit of the movprfx handling and avoid code
> duplication.  It will define some patterns that are initially unused, but 
> that's
> ok.  I think the direction of travel would be to use some of the others
> eventually.
> 
> OK with that change if there are no other comments in 24 hours.

Thanks, Richard. Here's the patch updated according to your feedback.
https://gcc.gnu.org/pipermail/gcc-patches/2024-August/658929.html

I'll commit it if there's no other comments in 24 hou

[PATCH v2] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

2024-07-31 Thread Pengxuan Zheng
This patch improves the Advanced SIMD popcount expansion by using SVE if
available.

For example, GCC currently generates the following code sequence for V2DI:
  cnt v31.16b, v31.16b
  uaddlp  v31.8h, v31.16b
  uaddlp  v31.4s, v31.8h
  uaddlp  v31.2d, v31.4s

However, by using SVE, we can generate the following sequence instead:
  ptrue   p7.b, all
  cnt z31.d, p7/m, z31.d

Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.

The scalar popcount expansion can also be improved similarly by using SVE and
those changes will be included in a separate patch.
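
As a source-level sketch of the Advanced SIMD case (my example and option
choices, not taken from the patch), a fixed V2DI element-wise popcount such as

void
f_v2di (unsigned long long *__restrict b, unsigned long long *__restrict d)
{
  /* Expected to vectorize to a single V2DI popcount, i.e. the
     ptrue + cnt z-register form above rather than CNT plus three UADDLPs.  */
  d[0] = __builtin_popcountll (b[0]);
  d[1] = __builtin_popcountll (b[1]);
}

compiled with something like -O2 -march=armv8.2-a+sve -fno-vect-cost-model
should pick up the SVE sequence.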

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
support.
* config/aarch64/aarch64-sve.md (@aarch64_pred_<optab><mode>): Use new
iterator SVE_VDQ_I.
* config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
(VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-sve.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md|  9 ++
 gcc/config/aarch64/aarch64-sve.md | 13 +--
 gcc/config/aarch64/iterators.md   |  5 ++
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++
 4 files changed, 109 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index bbeee221f37..895d6e5eab5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2"
 	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
   "TARGET_SIMD"
   {
+    if (TARGET_SVE)
+      {
+	rtx p = aarch64_ptrue_reg (<VPRED>mode);
+	emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
+						    p,
+						    operands[1]));
+	DONE;
+      }
+
     /* Generate a byte popcount.  */
     machine_mode mode = <bits> == 64 ? V8QImode : V16QImode;
     rtx tmp = gen_reg_rtx (mode);
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 5331e7121d5..eb3705ae515 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3104,16 +3104,16 @@ (define_expand "<optab><mode>2"
 
 ;; Integer unary arithmetic predicated with a PTRUE.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_I 0 "register_operand")
-	(unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+	(unspec:SVE_VDQ_I
 	  [(match_operand:<VPRED> 1 "register_operand")
-	   (SVE_INT_UNARY:SVE_I
-	     (match_operand:SVE_I 2 "register_operand"))]
+	   (SVE_INT_UNARY:SVE_VDQ_I
+	     (match_operand:SVE_VDQ_I 2 "register_operand"))]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
-     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
-     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
+     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
+     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
   }
 )
 
@@ -3168,6 +3168,7 @@ (define_insn "*cond_<optab><mode>_any"
   }
 )
 
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] General unary arithmetic corresponding to unspecs
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f527b2cfeb8..ee3d1fb98fd 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -559,6 +559,9 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
 ;; element modes
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
 
+;; All SVE and Advanced SIMD integer vector modes.
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
VNx4SI VNx2SI
@@ -2278,6 +2281,8 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI 
"VNx8BI")
 (VNx32BF "VNx8BI")
 (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+(V8QI "VNx8BI") (V16QI "VNx16BI")
+(V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
 (V4SI "VNx4BI") (V2DI "VNx2BI")])
 
 ;; ...and again in lower case.
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c 
b/gcc/testsuite/gc

[PATCH v2] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

2024-07-31 Thread Pengxuan Zheng
This has been approved and will be committed if there's no other comments in a
day.



[PATCH] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

2024-07-17 Thread Pengxuan Zheng
This patch improves the Advanced SIMD popcount expansion by using SVE if
available.

For example, GCC currently generates the following code sequence for V2DI:
  cnt v31.16b, v31.16b
  uaddlp  v31.8h, v31.16b
  uaddlp  v31.4s, v31.8h
  uaddlp  v31.2d, v31.4s

However, by using SVE, we can generate the following sequence instead:
  ptrue   p7.b, all
  cnt z31.d, p7/m, z31.d

Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.

The scalar popcount expansion can also be improved similarly by using SVE and
those changes will be included in a separate patch.
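
For illustration only (mirroring the style of the new popcnt-sve.c test; the
function name and options are my own), a V4SI case such as

void
f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)
{
  /* With -O2 -march=armv8.2-a+sve -fno-vect-cost-model this should now
     expand via the predicated SVE CNT on .s elements.  */
  d[0] = __builtin_popcount (b[0]);
  d[1] = __builtin_popcount (b[1]);
  d[2] = __builtin_popcount (b[2]);
  d[3] = __builtin_popcount (b[3]);
}

is covered by the same expansion.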

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
support.
* config/aarch64/aarch64-sve.md (@aarch64_pred_popcount<mode>): New
insn.
* config/aarch64/iterators.md (VPRED): Add V4HI, V8HI and V2SI.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-sve.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md|  9 ++
 gcc/config/aarch64/aarch64-sve.md | 12 +++
 gcc/config/aarch64/iterators.md   |  1 +
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++
 4 files changed, 110 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index bbeee221f37..895d6e5eab5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2"
 	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
   "TARGET_SIMD"
   {
+    if (TARGET_SVE)
+      {
+	rtx p = aarch64_ptrue_reg (<VPRED>mode);
+	emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
+						    p,
+						    operands[1]));
+	DONE;
+      }
+
     /* Generate a byte popcount.  */
     machine_mode mode = <bits> == 64 ? V8QImode : V16QImode;
     rtx tmp = gen_reg_rtx (mode);
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 5331e7121d5..b5021dd2da0 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3168,6 +3168,18 @@ (define_insn "*cond_<optab><mode>_any"
   }
 )
 
+;; Popcount predicated with a PTRUE.
+(define_insn "@aarch64_pred_popcount<mode>"
+  [(set (match_operand:VDQHSD 0 "register_operand" "=w")
+	(unspec:VDQHSD
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl")
+	   (popcount:VDQHSD
+	     (match_operand:VDQHSD 2 "register_operand" "0"))]
+	  UNSPEC_PRED_X))]
+  "TARGET_SVE"
+  "cnt\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] General unary arithmetic corresponding to unspecs
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f527b2cfeb8..a06159b23ea 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2278,6 +2278,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI 
"VNx8BI")
 (VNx32BF "VNx8BI")
 (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+(V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
 (V4SI "VNx4BI") (V2DI "VNx2BI")])
 
 ;; ...and again in lower case.
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
new file mode 100644
index 000..8e349efe390
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
@@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model 
-fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f_v4hi:
+** ptrue   (p[0-7]).b, all
+** ldr d([0-9]+), \[x0\]
+** cnt z\2.h, \1/m, z\2.h
+** str d\2, \[x1\]
+** ret
+*/
+void
+f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2] = __builtin_popcount (b[2]);
+  d[3] = __builtin_popcount (b[3]);
+}
+
+/*
+** f_v8hi:
+** ptrue   (p[0-7]).b, all
+** ldr q([0-9]+), \[x0\]
+** cnt z\2.h, \1/m, z\2.h
+** str q\2, \[x1\]
+** ret
+*/
+void
+f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2]

RE: [PATCH v9] aarch64: Add vector popcount besides QImode [PR113859]

2024-07-02 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > This patch improves GCC’s vectorization of __builtin_popcount for
> > aarch64 target by adding popcount patterns for vector modes besides
> > QImode, i.e., HImode, SImode and DImode.
> >
> > With this patch, we now generate the following for V8HI:
> >   cnt v1.16b, v0.16b
> >   uaddlp  v2.8h, v1.16b
> >
> > For V4HI, we generate:
> >   cnt v1.8b, v0.8b
> >   uaddlp  v2.4h, v1.8b
> >
> > For V4SI, we generate:
> >   cnt v1.16b, v0.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.4s, #0
> >   moviv1.16b, #1
> >   cnt v3.16b, v2.16b
> >   udotv0.4s, v3.16b, v1.16b
> >
> > For V2SI, we generate:
> >   cnt v1.8b, v.8b
> >   uaddlp  v2.4h, v1.8b
> >   uaddlp  v3.2s, v2.4h
> >
> > For V2SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.8b, #0
> >   moviv1.8b, #1
> >   cnt v3.8b, v2.8b
> >   udotv0.2s, v3.8b, v1.8b
> >
> > For V2DI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >   uaddlp  v4.2d, v3.4s
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.4s, #0
> >   moviv1.16b, #1
> >   cnt v3.16b, v2.16b
> >   udotv0.4s, v3.16b, v1.16b
> >   uaddlp  v0.2d, v0.4s
> >
> > PR target/113859
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (aarch64_addlp):
> Rename to...
> > (@aarch64_addlp): ... This.
> >     (popcount2): New define_expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt-udot.c: New test.
> > * gcc.target/aarch64/popcnt-vec.c: New test.
> 
> OK, thanks, and sorry for the slow review.
> 
> Richard

Thanks, Richard. Pushed as r15-1801-g895bbc08d38c2a.

Pengxuan
> 
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md| 41 ++-
> >  .../gcc.target/aarch64/popcnt-udot.c  | 58 
> >  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69
> > +++
> >  3 files changed, 167 insertions(+), 1 deletion(-)  create mode 100644
> > gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 01b084d8ccb..fd0c5e612b5 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3461,7 +3461,7 @@ (define_insn
> "*aarch64_addlv_ze"
> >[(set_attr "type" "neon_reduc_add")]
> >  )
> >
> > -(define_expand "aarch64_addlp"
> > +(define_expand "@aarch64_addlp"
> >[(set (match_operand: 0 "register_operand")
> > (plus:
> >   (vec_select:
> > @@ -3517,6 +3517,45 @@ (define_insn
> "popcount2"
> >[(set_attr "type" "neon_cnt")]
> >  )
> >
> > +(define_expand "popcount2"
> > +  [(set (match_operand:VDQHSD 0 "register_operand")
> > +   (popcount:VDQHSD (match_operand:VDQHSD 1
> "register_operand")))]
> > +  "TARGET_SIMD"
> > +  {
> > +/* Generate a byte popcount.  */
> > +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> > +rtx tmp = gen_reg_rtx (mode);
> > +auto icode = optab_handler (popcount_optab, mode);
> > +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode,
> > +operands[1])));
> > +
> > +if (TARGET_DOTPROD
> > +   && (mode == SImode || mode == DImode))
> > +  {
> > +   /* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator
> and a
> > +  1 multiplicand.  For V2DI, another UAADDLP is needed.  */
> > +   rtx ones = force_reg (mode, CONST1_RTX (mode));
> > +   auto icode = optab_handler (udot_prod_optab, mode);
> > +   mode =  == 64 ? V2SImode : V4SImode;
> > +   rtx dest = mode == mode ? operands[0] : gen_reg_rtx
> (mode);
> > +   rtx zeros = force_reg (mode, CONST0_RTX (mode));
> > +   emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
> > +   tmp = dest;
> > +  }
> > +
> > +/* Use a sequence of UADDLPs to accumulate the counts.  Each step
> doubles
> > + 

RE: [PATCH v6] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-28 Thread Pengxuan Zheng (QUIC)
> > On 6/28/24 6:18 AM, Pengxuan Zheng wrote:
> > > This patch improves GCC’s vectorization of __builtin_popcount for
> > > aarch64 target by adding popcount patterns for vector modes besides
> > > QImode, i.e., HImode, SImode and DImode.
> > >
> > > With this patch, we now generate the following for V8HI:
> > >cnt v1.16b, v0.16b
> > >uaddlp  v2.8h, v1.16b
> > >
> > > For V4HI, we generate:
> > >cnt v1.8b, v0.8b
> > >uaddlp  v2.4h, v1.8b
> > >
> > > For V4SI, we generate:
> > >cnt v1.16b, v0.16b
> > >uaddlp  v2.8h, v1.16b
> > >uaddlp  v3.4s, v2.8h
> > >
> > > For V4SI with TARGET_DOTPROD, we generate the following instead:
> > >moviv0.4s, #0
> > >moviv1.16b, #1
> > >cnt v3.16b, v2.16b
> > >udotv0.4s, v3.16b, v1.16b
> > >
> > > For V2SI, we generate:
> > >cnt v1.8b, v.8b
> > >uaddlp  v2.4h, v1.8b
> > >uaddlp  v3.2s, v2.4h
> > >
> > > For V2SI with TARGET_DOTPROD, we generate the following instead:
> > >moviv0.8b, #0
> > >moviv1.8b, #1
> > >cnt v3.8b, v2.8b
> > >udotv0.2s, v3.8b, v1.8b
> > >
> > > For V2DI, we generate:
> > >cnt v1.16b, v.16b
> > >uaddlp  v2.8h, v1.16b
> > >uaddlp  v3.4s, v2.8h
> > >uaddlp  v4.2d, v3.4s
> > >
> > > For V4SI with TARGET_DOTPROD, we generate the following instead:
> > >moviv0.4s, #0
> > >moviv1.16b, #1
> > >cnt v3.16b, v2.16b
> > >udotv0.4s, v3.16b, v1.16b
> > >uaddlp  v0.2d, v0.4s
> > >
> > >   PR target/113859
> > >
> > > gcc/ChangeLog:
> > >
> > >   * config/aarch64/aarch64-simd.md (aarch64_addlp):
> > Rename to...
> > >   (@aarch64_addlp): ... This.
> > >   (popcount2): New define_expand.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >   * gcc.target/aarch64/popcnt-udot.c: New test.
> > >   * gcc.target/aarch64/popcnt-vec.c: New test.
> > >
> > > Signed-off-by: Pengxuan Zheng 
> > > ---
> > >   gcc/config/aarch64/aarch64-simd.md| 41 ++-
> > >   .../gcc.target/aarch64/popcnt-udot.c  | 58 
> > >   gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69
> +++
> > >   3 files changed, 167 insertions(+), 1 deletion(-)
> > >   create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
> > >   create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > > b/gcc/config/aarch64/aarch64-simd.md
> > > index 01b084d8ccb..afdf3ec7873 100644
> > > --- a/gcc/config/aarch64/aarch64-simd.md
> > > +++ b/gcc/config/aarch64/aarch64-simd.md
> > > @@ -3461,7 +3461,7 @@ (define_insn
> > "*aarch64_addlv_ze"
> > > [(set_attr "type" "neon_reduc_add")]
> > >   )
> > >
> > > -(define_expand "aarch64_addlp"
> > > +(define_expand "@aarch64_addlp"
> > > [(set (match_operand: 0 "register_operand")
> > >   (plus:
> > > (vec_select:
> > > @@ -3517,6 +3517,45 @@ (define_insn
> > "popcount2"
> > > [(set_attr "type" "neon_cnt")]
> > >   )
> > >
> > > +(define_expand "popcount2"
> > > +  [(set (match_operand:VDQHSD 0 "register_operand")
> > > +(popcount:VDQHSD (match_operand:VDQHSD 1
> > > +"register_operand")))]
> > > +  "TARGET_SIMD"
> > > +  {
> > > +/* Generate a byte popcount. */
> >
> > A couple of formatting nits. Two spaces before end of comment.
> 
> I noticed this in other places, but didn't realize it's intentional. Glad you
> pointed this out!
> 
> >
> > > +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> > > +rtx tmp = gen_reg_rtx (mode);
> > > +auto icode = optab_handler (popcount_optab, mode);
> > > +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode,
> > > + operands[1])));
> > > +
> > > +if (TARGET_DOTPROD
> > > +&& (mode == SImode || mode == DImode))
> > > +  {
> > > +/* For V4SI and 

[PATCH v9] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-28 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.4s, #0
  movi    v1.16b, #1
  cnt     v3.16b, v2.16b
  udot    v0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.8b, #0
  movi    v1.8b, #1
  cnt     v3.8b, v2.8b
  udot    v0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V2DI with TARGET_DOTPROD, we generate the following instead:
  movi    v0.4s, #0
  movi    v1.16b, #1
  cnt     v3.16b, v2.16b
  udot    v0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s
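
As a usage sketch (example and flags are mine, not from the patch), the V8HI
sequence above corresponds to source along these lines, compiled with
-O2 -march=armv8-a -fno-vect-cost-model:

void
f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
{
  /* Eight independent popcounts should vectorize to one V8HI popcount,
     i.e. a byte CNT followed by a single UADDLP.  */
  for (int i = 0; i < 8; i++)
    d[i] = __builtin_popcount (b[i]);
}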

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 41 ++-
 .../gcc.target/aarch64/popcnt-udot.c  | 58 
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
 3 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 01b084d8ccb..fd0c5e612b5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,45 @@ (define_insn "popcount<mode>2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    /* Generate a byte popcount.  */
+    machine_mode mode = <bits> == 64 ? V8QImode : V16QImode;
+    rtx tmp = gen_reg_rtx (mode);
+    auto icode = optab_handler (popcount_optab, mode);
+    emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+    if (TARGET_DOTPROD
+	&& (<VEL>mode == SImode || <VEL>mode == DImode))
+      {
+	/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+	   1 multiplicand.  For V2DI, another UAADDLP is needed.  */
+	rtx ones = force_reg (mode, CONST1_RTX (mode));
+	auto icode = optab_handler (udot_prod_optab, mode);
+	mode = <bits> == 64 ? V2SImode : V4SImode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	rtx zeros = force_reg (mode, CONST0_RTX (mode));
+	emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
+	tmp = dest;
+      }
+
+    /* Use a sequence of UADDLPs to accumulate the counts.  Each step doubles
+       the element size and halves the number of elements.  */
+    while (mode != <MODE>mode)
+      {
+	auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+	mode = insn_data[icode].operand[0].mode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	emit_insn (GEN_FCN (icode) (dest, tmp));
+	tmp = dest;
+      }
+    DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..f6a968dae95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model 
-fno-schedule-insns -fno-schedule-insns2" } */
+
+/*
+** bar:
+** moviv([0-9]+).16b, 0x1
+** moviv([0-9]+).4s, 0
+** ldr q([0-9]+), \[x0\]
+** cnt v([0-9]+).16b, v\3.16b
+** udotv\2.4s, v\4.16b, v\1.16b
+** str q\2, \[x1\]
+** ret
+*/
+void
+bar (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[

RE: [PATCH v6] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-28 Thread Pengxuan Zheng (QUIC)
> On 6/28/24 6:18 AM, Pengxuan Zheng wrote:
> > This patch improves GCC’s vectorization of __builtin_popcount for
> > aarch64 target by adding popcount patterns for vector modes besides
> > QImode, i.e., HImode, SImode and DImode.
> >
> > With this patch, we now generate the following for V8HI:
> >cnt v1.16b, v0.16b
> >uaddlp  v2.8h, v1.16b
> >
> > For V4HI, we generate:
> >cnt v1.8b, v0.8b
> >uaddlp  v2.4h, v1.8b
> >
> > For V4SI, we generate:
> >cnt v1.16b, v0.16b
> >uaddlp  v2.8h, v1.16b
> >uaddlp  v3.4s, v2.8h
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >moviv0.4s, #0
> >moviv1.16b, #1
> >cnt v3.16b, v2.16b
> >udotv0.4s, v3.16b, v1.16b
> >
> > For V2SI, we generate:
> >cnt v1.8b, v.8b
> >uaddlp  v2.4h, v1.8b
> >uaddlp  v3.2s, v2.4h
> >
> > For V2SI with TARGET_DOTPROD, we generate the following instead:
> >moviv0.8b, #0
> >moviv1.8b, #1
> >cnt v3.8b, v2.8b
> >udotv0.2s, v3.8b, v1.8b
> >
> > For V2DI, we generate:
> >cnt v1.16b, v.16b
> >uaddlp  v2.8h, v1.16b
> >uaddlp  v3.4s, v2.8h
> >uaddlp  v4.2d, v3.4s
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >moviv0.4s, #0
> >moviv1.16b, #1
> >cnt v3.16b, v2.16b
> >udotv0.4s, v3.16b, v1.16b
> >uaddlp  v0.2d, v0.4s
> >
> > PR target/113859
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (aarch64_addlp):
> Rename to...
> > (@aarch64_addlp): ... This.
> > (popcount2): New define_expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt-udot.c: New test.
> > * gcc.target/aarch64/popcnt-vec.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >   gcc/config/aarch64/aarch64-simd.md| 41 ++-
> >   .../gcc.target/aarch64/popcnt-udot.c  | 58 
> >   gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
> >   3 files changed, 167 insertions(+), 1 deletion(-)
> >   create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
> >   create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 01b084d8ccb..afdf3ec7873 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3461,7 +3461,7 @@ (define_insn
> "*aarch64_addlv_ze"
> > [(set_attr "type" "neon_reduc_add")]
> >   )
> >
> > -(define_expand "aarch64_addlp"
> > +(define_expand "@aarch64_addlp"
> > [(set (match_operand: 0 "register_operand")
> > (plus:
> >   (vec_select:
> > @@ -3517,6 +3517,45 @@ (define_insn
> "popcount2"
> > [(set_attr "type" "neon_cnt")]
> >   )
> >
> > +(define_expand "popcount2"
> > +  [(set (match_operand:VDQHSD 0 "register_operand")
> > +(popcount:VDQHSD (match_operand:VDQHSD 1
> > +"register_operand")))]
> > +  "TARGET_SIMD"
> > +  {
> > +/* Generate a byte popcount. */
> 
> A couple of formatting nits. Two spaces before end of comment.

I noticed this in other places, but didn't realize it's intentional. Glad you 
pointed this out!

> 
> > +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> > +rtx tmp = gen_reg_rtx (mode);
> > +auto icode = optab_handler (popcount_optab, mode);
> > +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode,
> > + operands[1])));
> > +
> > +if (TARGET_DOTPROD
> > +&& (mode == SImode || mode == DImode))
> > +  {
> > +/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator
> and a
> > +   1 multiplicand. For V2DI, another UAADDLP is needed. */
> 
> Likewise.
> 
> > +rtx ones = force_reg (mode, CONST1_RTX (mode));
> > +auto icode = optab_handler (udot_prod_optab, mode);
> > +mode =  == 64 ? V2SImode : V4SImode;
> > +rtx dest = mode == mode ? operands[0] : gen_reg_rtx
> (mode);
> > +rtx zeros = force_reg (mode, CONST0_RTX (mode));
> > +emit_insn (GEN_FCN (icode) (dest, tmp, 

[PATCH v8] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-28 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.8b, #0
  moviv1.8b, #1
  cnt v3.8b, v2.8b
  udotv0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V2DI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 41 ++-
 .../gcc.target/aarch64/popcnt-udot.c  | 58 
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
 3 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 01b084d8ccb..04c97d076a9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,45 @@ (define_insn "popcount<mode>2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+/* Generate a byte popcount.  */
+machine_mode mode =  == 64 ? V8QImode : V16QImode;
+rtx tmp = gen_reg_rtx (mode);
+auto icode = optab_handler (popcount_optab, mode);
+emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+if (TARGET_DOTPROD
+&& (mode == SImode || mode == DImode))
+  {
+/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+   1 multiplicand.  For V2DI, another UAADDLP is needed.  */
+rtx ones = force_reg (mode, CONST1_RTX (mode));
+auto icode = optab_handler (udot_prod_optab, mode);
+mode =  == 64 ? V2SImode : V4SImode;
+rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+rtx zeros = force_reg (mode, CONST0_RTX (mode));
+emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
+tmp = dest;
+  }
+
+/* Use a sequence of UADDLPs to accumulate the counts.  Each step doubles
+   the element size and halves the number of elements.  */
+while (mode != mode)
+  {
+auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+mode = insn_data[icode].operand[0].mode;
+rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+emit_insn (GEN_FCN (icode) (dest, tmp));
+tmp = dest;
+  }
+DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..f6a968dae95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model 
-fno-schedule-insns -fno-schedule-insns2" } */
+
+/*
+** bar:
+** moviv([0-9]+).16b, 0x1
+** moviv([0-9]+).4s, 0
+** ldr q([0-9]+), \[x0\]
+** cnt v([0-9]+).16b, v\3.16b
+** udotv\2.4s, v\4.16b, v\1.16b
+** str q\2, \[x1\]
+** ret
+*/
+void
+bar (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b

RE: [PATCH v7] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-28 Thread Pengxuan Zheng (QUIC)
Please ignore this patch. I accidently added unrelated changes. I'll push a 
correct version shortly.

Sorry for the noise.

Thanks,
Pengxuan
> This patch improves GCC’s vectorization of __builtin_popcount for aarch64
> target by adding popcount patterns for vector modes besides QImode, i.e.,
> HImode, SImode and DImode.
> 
> With this patch, we now generate the following for V8HI:
>   cnt v1.16b, v0.16b
>   uaddlp  v2.8h, v1.16b
> 
> For V4HI, we generate:
>   cnt v1.8b, v0.8b
>   uaddlp  v2.4h, v1.8b
> 
> For V4SI, we generate:
>   cnt v1.16b, v0.16b
>   uaddlp  v2.8h, v1.16b
>   uaddlp  v3.4s, v2.8h
> 
> For V4SI with TARGET_DOTPROD, we generate the following instead:
>   moviv0.4s, #0
>   moviv1.16b, #1
>   cnt v3.16b, v2.16b
>   udotv0.4s, v3.16b, v1.16b
> 
> For V2SI, we generate:
>   cnt v1.8b, v.8b
>   uaddlp  v2.4h, v1.8b
>   uaddlp  v3.2s, v2.4h
> 
> For V2SI with TARGET_DOTPROD, we generate the following instead:
>   moviv0.8b, #0
>   moviv1.8b, #1
>   cnt v3.8b, v2.8b
>   udotv0.2s, v3.8b, v1.8b
> 
> For V2DI, we generate:
>   cnt v1.16b, v.16b
>   uaddlp  v2.8h, v1.16b
>   uaddlp  v3.4s, v2.8h
>   uaddlp  v4.2d, v3.4s
> 
> For V4SI with TARGET_DOTPROD, we generate the following instead:
>   moviv0.4s, #0
>   moviv1.16b, #1
>   cnt v3.16b, v2.16b
>   udotv0.4s, v3.16b, v1.16b
>   uaddlp  v0.2d, v0.4s
> 
>   PR target/113859
> 
> gcc/ChangeLog:
> 
>   * config/aarch64/aarch64-simd.md (aarch64_addlp):
> Rename to...
>   (@aarch64_addlp): ... This.
>   (popcount2): New define_expand.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/aarch64/popcnt-udot.c: New test.
>   * gcc.target/aarch64/popcnt-vec.c: New test.
> 
> Signed-off-by: Pengxuan Zheng 
> ---
>  gcc/config/aarch64/aarch64-simd.md| 41 ++-
>  .../gcc.target/aarch64/popcnt-udot.c  | 58 
>  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
>  3 files changed, 167 insertions(+), 1 deletion(-)  create mode 100644
> gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index 01b084d8ccb..04c97d076a9 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3461,7 +3461,7 @@ (define_insn
> "*aarch64_addlv_ze"
>[(set_attr "type" "neon_reduc_add")]
>  )
> 
> -(define_expand "aarch64_addlp"
> +(define_expand "@aarch64_addlp"
>[(set (match_operand: 0 "register_operand")
>   (plus:
> (vec_select:
> @@ -3517,6 +3517,45 @@ (define_insn "popcount2"
>[(set_attr "type" "neon_cnt")]
>  )
> 
> +(define_expand "popcount2"
> +  [(set (match_operand:VDQHSD 0 "register_operand")
> +(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
> +  "TARGET_SIMD"
> +  {
> +/* Generate a byte popcount.  */
> +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> +rtx tmp = gen_reg_rtx (mode);
> +auto icode = optab_handler (popcount_optab, mode);
> +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
> +
> +if (TARGET_DOTPROD
> +&& (mode == SImode || mode == DImode))
> +  {
> +/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and
> a
> +   1 multiplicand.  For V2DI, another UAADDLP is needed.  */
> +rtx ones = force_reg (mode, CONST1_RTX (mode));
> +auto icode = optab_handler (udot_prod_optab, mode);
> +mode =  == 64 ? V2SImode : V4SImode;
> +rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
> +rtx zeros = force_reg (mode, CONST0_RTX (mode));
> +emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
> +tmp = dest;
> +  }
> +
> +/* Use a sequence of UADDLPs to accumulate the counts.  Each step
> doubles
> +   the element size and halves the number of elements.  */
> +while (mode != mode)
> +  {
> +auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE
> (tmp));
> +mode = insn_data[icode].operand[0].mode;
> +rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
> +emit_insn (GEN_FCN (icode) (dest, tmp));
> +tmp = dest;
> +  }
> +DONE;
> +  }
> +)
> +
>  ;; 'across lanes' max and min ops.
> 
>  ;; Template for 

[PATCH v7] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-28 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.8b, #0
  moviv1.8b, #1
  cnt v3.8b, v2.8b
  udotv0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V2DI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 41 ++-
 .../gcc.target/aarch64/popcnt-udot.c  | 58 
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
 3 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 01b084d8ccb..04c97d076a9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,45 @@ (define_insn "popcount<mode>2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+/* Generate a byte popcount.  */
+machine_mode mode =  == 64 ? V8QImode : V16QImode;
+rtx tmp = gen_reg_rtx (mode);
+auto icode = optab_handler (popcount_optab, mode);
+emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+if (TARGET_DOTPROD
+&& (mode == SImode || mode == DImode))
+  {
+/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+   1 multiplicand.  For V2DI, another UAADDLP is needed.  */
+rtx ones = force_reg (mode, CONST1_RTX (mode));
+auto icode = optab_handler (udot_prod_optab, mode);
+mode =  == 64 ? V2SImode : V4SImode;
+rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+rtx zeros = force_reg (mode, CONST0_RTX (mode));
+emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
+tmp = dest;
+  }
+
+/* Use a sequence of UADDLPs to accumulate the counts.  Each step doubles
+   the element size and halves the number of elements.  */
+while (mode != mode)
+  {
+auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+mode = insn_data[icode].operand[0].mode;
+rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode);
+emit_insn (GEN_FCN (icode) (dest, tmp));
+tmp = dest;
+  }
+DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..f6a968dae95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model 
-fno-schedule-insns -fno-schedule-insns2" } */
+
+/*
+** bar:
+** moviv([0-9]+).16b, 0x1
+** moviv([0-9]+).4s, 0
+** ldr q([0-9]+), \[x0\]
+** cnt v([0-9]+).16b, v\3.16b
+** udotv\2.4s, v\4.16b, v\1.16b
+** str q\2, \[x1\]
+** ret
+*/
+void
+bar (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b

RE: [PATCH v5] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-27 Thread Pengxuan Zheng (QUIC)
Thanks, Richard! I've updated the patch accordingly.

https://gcc.gnu.org/pipermail/gcc-patches/2024-June/655912.html

Please let me know if any other changes are needed.

Thanks,
Pengxuan
> Sorry for the slow reply.
> 
> Pengxuan Zheng  writes:
> > This patch improves GCC’s vectorization of __builtin_popcount for
> > aarch64 target by adding popcount patterns for vector modes besides
> > QImode, i.e., HImode, SImode and DImode.
> >
> > With this patch, we now generate the following for V8HI:
> >   cnt v1.16b, v0.16b
> >   uaddlp  v2.8h, v1.16b
> >
> > For V4HI, we generate:
> >   cnt v1.8b, v0.8b
> >   uaddlp  v2.4h, v1.8b
> >
> > For V4SI, we generate:
> >   cnt v1.16b, v0.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.4s, #0
> >   moviv1.16b, #1
> >   cnt v3.16b, v2.16b
> >   udotv0.4s, v3.16b, v1.16b
> >
> > For V2SI, we generate:
> >   cnt v1.8b, v.8b
> >   uaddlp  v2.4h, v1.8b
> >   uaddlp  v3.2s, v2.4h
> >
> > For V2SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.8b, #0
> >   moviv1.8b, #1
> >   cnt v3.8b, v2.8b
> >   udotv0.2s, v3.8b, v1.8b
> >
> > For V2DI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >   uaddlp  v4.2d, v3.4s
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.4s, #0
> >   moviv1.16b, #1
> >   cnt v3.16b, v2.16b
> >   udotv0.4s, v3.16b, v1.16b
> >   uaddlp  v0.2d, v0.4s
> >
> > PR target/113859
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (aarch64_addlp):
> Rename to...
> > (@aarch64_addlp): ... This.
> > (popcount2): New define_expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt-udot.c: New test.
> > * gcc.target/aarch64/popcnt-vec.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md| 51 +-
> >  .../gcc.target/aarch64/popcnt-udot.c  | 58 
> >  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69
> > +++
> >  3 files changed, 177 insertions(+), 1 deletion(-)  create mode 100644
> > gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 0bb39091a38..1c76123a518 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3461,7 +3461,7 @@ (define_insn
> "*aarch64_addlv_ze"
> >[(set_attr "type" "neon_reduc_add")]
> >  )
> >
> > -(define_expand "aarch64_addlp"
> > +(define_expand "@aarch64_addlp"
> >[(set (match_operand: 0 "register_operand")
> > (plus:
> >   (vec_select:
> > @@ -3517,6 +3517,55 @@ (define_insn
> "popcount2"
> >[(set_attr "type" "neon_cnt")]
> >  )
> >
> > +(define_expand "popcount2"
> > +  [(set (match_operand:VDQHSD 0 "register_operand")
> > +(popcount:VDQHSD (match_operand:VDQHSD 1
> > +"register_operand")))]
> > +  "TARGET_SIMD"
> > +  {
> > +/* Generate a byte popcount. */
> > +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> > +rtx tmp = gen_reg_rtx (mode);
> > +auto icode = optab_handler (popcount_optab, mode);
> > +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode,
> > +operands[1])));
> > +
> > +if (TARGET_DOTPROD)
> > +  {
> > +/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator
> and a
> > +   1 multiplicand. For V2DI, another UAADDLP is needed. */
> > +if (mode == SImode || mode == DImode)
> 
> How about combining these into a single if:
> 
>   if (TARGET_DOTPROD
>   && (mode == SImode || mode == DImode))
> 
> > +  {
> > +machine_mode dp_mode =  == 64 ? V2SImode : V4SImode;
> > +rtx ones = force_reg (mode, CONST1_RTX (mode));
> > +rtx zeros = CONST0_RTX (dp_mode);
> > +rtx dp = gen_reg_rtx (dp_mode);
> &g

[PATCH v6] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-27 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.8b, #0
  moviv1.8b, #1
  cnt v3.8b, v2.8b
  udotv0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V2DI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 41 ++-
 .../gcc.target/aarch64/popcnt-udot.c  | 58 
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
 3 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 01b084d8ccb..afdf3ec7873 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,45 @@ (define_insn "popcount2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    /* Generate a byte popcount. */
+    machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
+    rtx tmp = gen_reg_rtx (mode);
+    auto icode = optab_handler (popcount_optab, mode);
+    emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+    if (TARGET_DOTPROD
+	&& (<VEL>mode == SImode || <VEL>mode == DImode))
+      {
+	/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+	   1 multiplicand. For V2DI, another UADDLP is needed. */
+	rtx ones = force_reg (mode, CONST1_RTX (mode));
+	auto icode = optab_handler (udot_prod_optab, mode);
+	mode = <bitsize> == 64 ? V2SImode : V4SImode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	rtx zeros = force_reg (mode, CONST0_RTX (mode));
+	emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
+	tmp = dest;
+      }
+
+    /* Use a sequence of UADDLPs to accumulate the counts. Each step doubles
+       the element size and halves the number of elements. */
+    while (mode != <MODE>mode)
+      {
+	auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+	mode = insn_data[icode].operand[0].mode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	emit_insn (GEN_FCN (icode) (dest, tmp));
+	tmp = dest;
+      }
+    DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..f6a968dae95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model 
-fno-schedule-insns -fno-schedule-insns2" } */
+
+/*
+** bar:
+** movi v([0-9]+).16b, 0x1
+** movi v([0-9]+).4s, 0
+** ldr q([0-9]+), \[x0\]
+** cnt v([0-9]+).16b, v\3.16b
+** udot v\2.4s, v\4.16b, v\1.16b
+** str q\2, \[x1\]
+** ret
+*/
+void
+bar (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);

RE: [PATCH v4] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-18 Thread Pengxuan Zheng (QUIC)
> On Mon, Jun 17, 2024 at 11:25 PM Pengxuan Zheng
>  wrote:
> >
> > This patch improves GCC’s vectorization of __builtin_popcount for
> > aarch64 target by adding popcount patterns for vector modes besides
> > QImode, i.e., HImode, SImode and DImode.
> >
> > With this patch, we now generate the following for V8HI:
> >   cnt v1.16b, v0.16b
> >   uaddlp  v2.8h, v1.16b
> >
> > For V4HI, we generate:
> >   cnt v1.8b, v0.8b
> >   uaddlp  v2.4h, v1.8b
> >
> > For V4SI, we generate:
> >   cnt v1.16b, v0.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.4s, #0
> >   moviv1.16b, #1
> >   cnt v3.16b, v2.16b
> >   udotv0.4s, v3.16b, v1.16b
> >
> > For V2SI, we generate:
> >   cnt v1.8b, v.8b
> >   uaddlp  v2.4h, v1.8b
> >   uaddlp  v3.2s, v2.4h
> >
> > For V2SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.8b, #0
> >   moviv1.8b, #1
> >   cnt v3.8b, v2.8b
> >   udotv0.2s, v3.8b, v1.8b
> >
> > For V2DI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >   uaddlp  v4.2d, v3.4s
> >
> > For V4SI with TARGET_DOTPROD, we generate the following instead:
> >   moviv0.4s, #0
> >   moviv1.16b, #1
> >   cnt v3.16b, v2.16b
> >   udotv0.4s, v3.16b, v1.16b
> >   uaddlp  v0.2d, v0.4s
> >
> > PR target/113859
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (aarch64_addlp):
> Rename to...
> > (@aarch64_addlp): ... This.
> > (popcount2): New define_expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt-udot.c: New test.
> > * gcc.target/aarch64/popcnt-vec.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md| 52 +-
> >  .../gcc.target/aarch64/popcnt-udot.c  | 45 
> >  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69
> > +++
> >  3 files changed, 165 insertions(+), 1 deletion(-)  create mode 100644
> > gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 0bb39091a38..3bdd4400408 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3461,7 +3461,7 @@ (define_insn
> "*aarch64_addlv_ze"
> >[(set_attr "type" "neon_reduc_add")]
> >  )
> >
> > -(define_expand "aarch64_addlp"
> > +(define_expand "@aarch64_addlp"
> >[(set (match_operand: 0 "register_operand")
> > (plus:
> >   (vec_select:
> > @@ -3517,6 +3517,56 @@ (define_insn
> "popcount2"
> >[(set_attr "type" "neon_cnt")]
> >  )
> >
> > +(define_expand "popcount2"
> > +  [(set (match_operand:VDQHSD 0 "register_operand")
> > +(popcount:VDQHSD (match_operand:VDQHSD 1
> > +"register_operand")))]
> > +  "TARGET_SIMD"
> > +  {
> > +/* Generate a byte popcount. */
> > +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> > +rtx tmp = gen_reg_rtx (mode);
> > +auto icode = optab_handler (popcount_optab, mode);
> > +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode,
> > +operands[1])));
> > +
> > +if (TARGET_DOTPROD)
> > +  {
> > +/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator
> and a
> > +   1 multiplicant. For V2DI, another UAADDLP is needed. */
> > +    if (<MODE>mode == V4SImode || <MODE>mode == V2SImode
> > +	|| <MODE>mode == V2DImode)
> 
> I think the above can be simplified/modified to just `<VEL>mode == SImode ||
> <VEL>mode == DImode`.
> Also s/multiplicant/multiplicand/ .

Thanks, Andrew! I have updated the patch accordingly.

https://gcc.gnu.org/pipermail/gcc-patches/2024-June/655020.html
> 
> > +  {
> > +machine_mode dp_mode =  == 64 ? V2SImode : V4SImode;
> > +rtx ones = force_reg (mode, CONST1_RTX (mode));
> > +rtx zeros = CONST0_RTX (dp_mode);
> > +rtx dp = gen_reg_rtx (dp_mode);

[PATCH v5] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-18 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.8b, #0
  moviv1.8b, #1
  cnt v3.8b, v2.8b
  udotv0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V2DI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s
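
A side note from me (not in the patch text): UDOT only accumulates into 32-bit
lanes, which is why the V2DI case above needs one extra pairwise widening add.
In scalar terms that final step is just:

#include <stdint.h>

/* Models uaddlp v0.2d, v0.4s: adjacent 32-bit counts -> one 64-bit count.  */
static void
fold_v4si_to_v2di (uint64_t out[2], const uint32_t in[4])
{
  out[0] = (uint64_t) in[0] + in[1];
  out[1] = (uint64_t) in[2] + in[3];
}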

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 51 +-
 .../gcc.target/aarch64/popcnt-udot.c  | 58 
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
 3 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 0bb39091a38..1c76123a518 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,55 @@ (define_insn "popcount2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    /* Generate a byte popcount. */
+    machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
+    rtx tmp = gen_reg_rtx (mode);
+    auto icode = optab_handler (popcount_optab, mode);
+    emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+    if (TARGET_DOTPROD)
+      {
+	/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+	   1 multiplicand. For V2DI, another UADDLP is needed. */
+	if (<VEL>mode == SImode || <VEL>mode == DImode)
+	  {
+	    machine_mode dp_mode = <bitsize> == 64 ? V2SImode : V4SImode;
+	    rtx ones = force_reg (mode, CONST1_RTX (mode));
+	    rtx zeros = CONST0_RTX (dp_mode);
+	    rtx dp = gen_reg_rtx (dp_mode);
+	    auto dp_icode = optab_handler (udot_prod_optab, mode);
+	    emit_move_insn (dp, zeros);
+	    emit_insn (GEN_FCN (dp_icode) (dp, tmp, ones, dp));
+	    if (<MODE>mode == V2DImode)
+	      {
+		emit_insn (gen_aarch64_uaddlpv4si (operands[0], dp));
+		DONE;
+	      }
+	    emit_move_insn (operands[0], dp);
+	    DONE;
+	  }
+      }
+
+    /* Use a sequence of UADDLPs to accumulate the counts. Each step doubles
+       the element size and halves the number of elements. */
+    do
+      {
+	auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+	mode = insn_data[icode].operand[0].mode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	emit_insn (GEN_FCN (icode) (dest, tmp));
+	tmp = dest;
+      }
+    while (mode != <MODE>mode);
+    DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..150ff746361
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model" } */
+
+/*
+** bar:
+** ldr q([0-9]+), \[x0\]
+** moviv([0-9]+).16b, 0x1
+** moviv([0-9]+).4s, 

RE: [PATCH v3] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-17 Thread Pengxuan Zheng (QUIC)
> Hi,
> 
> > -Original Message-
> > From: Pengxuan Zheng 
> > Sent: Friday, June 14, 2024 12:57 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Pengxuan Zheng 
> > Subject: [PATCH v3] aarch64: Add vector popcount besides QImode
> > [PR113859]
> >
> > This patch improves GCC’s vectorization of __builtin_popcount for
> > aarch64 target by adding popcount patterns for vector modes besides
> > QImode, i.e., HImode, SImode and DImode.
> >
> > With this patch, we now generate the following for V8HI:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >
> > For V4HI, we generate:
> >   cnt v1.8b, v.8b
> >   uaddlp  v2.4h, v1.8b
> >
> > For V4SI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >
> > For V2SI, we generate:
> >   cnt v1.8b, v.8b
> >   uaddlp  v2.4h, v1.8b
> >   uaddlp  v3.2s, v2.4h
> >
> > For V2DI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >   uaddlp  v4.2d, v3.4s
> 
> Nice patch!  We can do better for these sequences though. Would you
> instead consider using udot with a 0 accumulator and 1 multiplicand.
> 
> Essentially
> movi v0.16b, #0
> movi v1.16b, #1
> cnt v3.16b, v2.16b
> udot  v0.4s, v3.16b, v1.16b
> 
> this has 1 instruction less on the critical path so should be half the 
> latency of
> the uaddlp variants.
> 
> For the DI case you'll still need a final uaddlp.

Thanks for your suggestions, Tamar! That's indeed more efficient. I have updated
the patch accordingly. Please let me know if you have any other comments.

https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654947.html
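
To make Tamar's trick concrete (my sketch, not from the thread): UDOT computes
a 4-way dot product per 32-bit lane, so with an all-ones multiplicand it simply
sums each lane's four byte popcounts, the same result the two UADDLP steps
would have produced.

#include <stdint.h>

/* Scalar model of udot v0.4s, v3.16b, v1.16b with v1 = {1, 1, ..., 1}.  */
static void
udot_ones (uint32_t acc[4], const uint8_t cnt[16])
{
  for (int lane = 0; lane < 4; lane++)
    for (int j = 0; j < 4; j++)
      acc[lane] += cnt[4 * lane + j];	/* multiplicand is 1 */
}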

Thanks,
Pengxuan
> 
> Cheers,
> Tamar
> 
> >
> > PR target/113859
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (aarch64_addlp):
> > Rename to...
> > (@aarch64_addlp): ... This.
> > (popcount2): New define_expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt-vec.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md| 28 +++-
> >  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69
> > +++
> >  2 files changed, 96 insertions(+), 1 deletion(-)  create mode 100644
> > gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64- simd.md index 0bb39091a38..ee73e13534b
> > 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3461,7 +3461,7 @@ (define_insn
> > "*aarch64_addlv_ze"
> >[(set_attr "type" "neon_reduc_add")]
> >  )
> >
> > -(define_expand "aarch64_addlp"
> > +(define_expand "@aarch64_addlp"
> >[(set (match_operand: 0 "register_operand")
> > (plus:
> >   (vec_select:
> > @@ -3517,6 +3517,32 @@ (define_insn
> "popcount2"
> >[(set_attr "type" "neon_cnt")]
> >  )
> >
> > +(define_expand "popcount2"
> > +  [(set (match_operand:VDQHSD 0 "register_operand")
> > +(popcount:VDQHSD (match_operand:VDQHSD 1
> > +"register_operand")))]
> > +  "TARGET_SIMD"
> > +  {
> > +/* Generate a byte popcount. */
> > +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> > +rtx tmp = gen_reg_rtx (mode);
> > +auto icode = optab_handler (popcount_optab, mode);
> > +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode,
> > +operands[1])));
> > +
> > +/* Use a sequence of UADDLPs to accumulate the counts. Each step
> doubles
> > +   the element size and halves the number of elements. */
> > +do
> > +  {
> > +auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE
> (tmp));
> > +mode = insn_data[icode].operand[0].mode;
> > +rtx dest = mode == mode ? operands[0] : gen_reg_rtx
> (mode);
> > +emit_insn (GEN_FCN (icode) (dest, tmp));
> > +tmp = dest;
> > +  }
> > +while (mode != mode);
> > +DONE;
> > +  }
> > +)
> > +
> >  ;; 'across lanes' max and min ops.
> >
> >  ;; Template for outputting a scalar, so we can create __builtins
> > which can be diff --git
> > a/gcc/testsuite/

[PATCH v4] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-17 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v0.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v0.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V4SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2SI with TARGET_DOTPROD, we generate the following instead:
  moviv0.8b, #0
  moviv1.8b, #1
  cnt v3.8b, v2.8b
  udotv0.2s, v3.8b, v1.8b

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

For V2DI with TARGET_DOTPROD, we generate the following instead:
  moviv0.4s, #0
  moviv1.16b, #1
  cnt v3.16b, v2.16b
  udotv0.4s, v3.16b, v1.16b
  uaddlp  v0.2d, v0.4s

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-udot.c: New test.
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 52 +-
 .../gcc.target/aarch64/popcnt-udot.c  | 45 
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
 3 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 0bb39091a38..3bdd4400408 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,56 @@ (define_insn "popcount2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    /* Generate a byte popcount. */
+    machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
+    rtx tmp = gen_reg_rtx (mode);
+    auto icode = optab_handler (popcount_optab, mode);
+    emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+    if (TARGET_DOTPROD)
+      {
+	/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+	   1 multiplicant. For V2DI, another UAADDLP is needed. */
+	if (<MODE>mode == V4SImode || <MODE>mode == V2SImode
+	    || <MODE>mode == V2DImode)
+	  {
+	    machine_mode dp_mode = <bitsize> == 64 ? V2SImode : V4SImode;
+	    rtx ones = force_reg (mode, CONST1_RTX (mode));
+	    rtx zeros = CONST0_RTX (dp_mode);
+	    rtx dp = gen_reg_rtx (dp_mode);
+	    auto dp_icode = optab_handler (udot_prod_optab, mode);
+	    emit_move_insn (dp, zeros);
+	    emit_insn (GEN_FCN (dp_icode) (dp, tmp, ones, dp));
+	    if (<MODE>mode == V2DImode)
+	      {
+		emit_insn (gen_aarch64_uaddlpv4si (operands[0], dp));
+		DONE;
+	      }
+	    emit_move_insn (operands[0], dp);
+	    DONE;
+	  }
+      }
+
+    /* Use a sequence of UADDLPs to accumulate the counts. Each step doubles
+       the element size and halves the number of elements. */
+    do
+      {
+	auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+	mode = insn_data[icode].operand[0].mode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	emit_insn (GEN_FCN (icode) (dest, tmp));
+	tmp = dest;
+      }
+    while (mode != <MODE>mode);
+    DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..759d34a2d10
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model" } */
+
+/* This function should produce cnt v.16b and udot. */
+void
+bar (unsigned in

RE: [PATCH] aarch64: Add fix_truncv4sfv4hi2 pattern [PR113882]

2024-06-17 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > This patch adds the fix_truncv4sfv4hi2 (V4SF->V4HI) pattern which is
> > implemented using fix_truncv4sfv4si2 (V4SF->V4SI) and then truncv4siv4hi2
> (V4SI->V4HI).
> >
> > PR target/113882
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (fix_truncv4sfv4hi2): New pattern.
> 
> Could we handle this by extending the target-independent code instead?
> Richard mentioned in comment 1 that the current set of intermediate
> conversions is hard-coded, but it didn't sound like he was implying that the
> set shouldn't change.

Yes, Richard. I checked the target-independent code. In fact, SLP already 
handles this type of intermediate conversion. However, the logic is guarded by
"!flag_trapping_math". Therefore, if we pass -fno-trapping-math, SLP actually
generates the right vectorized code. Also, it looks like the check for
"!flag_trapping_math" was added intentionally in r14-2085-g77a50c772771f6 to fix
some PRs. So, I'm not sure what we should do here. Thoughts?

  if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
  && (code == FLOAT_EXPR ||
  (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
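
For reference, a hedged example of what that guarded path covers (my own, not
from the thread): with -fno-trapping-math the existing target-independent code
already vectorizes the narrowing below through an intermediate V4SI conversion;
without the flag that path is skipped, which is the gap the fix_truncv4sfv4hi2
pattern was meant to fill.

void
narrow4 (short *__restrict a, float *__restrict b)
{
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
}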

Thanks,
Pengxuan
> 
> Thanks,
> Richard
> 
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/fix_trunc2.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md| 13 +
> >  gcc/testsuite/gcc.target/aarch64/fix_trunc2.c | 14 ++
> >  2 files changed, 27 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/fix_trunc2.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 868f4486218..096f7b56a27 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3032,6 +3032,19 @@ (define_expand
> "2"
> >"TARGET_SIMD"
> >{})
> >
> > +
> > +(define_expand "fix_truncv4sfv4hi2"
> > +  [(match_operand:V4HI 0 "register_operand")
> > +   (match_operand:V4SF 1 "register_operand")]
> > +  "TARGET_SIMD"
> > +  {
> > +rtx tmp = gen_reg_rtx (V4SImode);
> > +emit_insn (gen_fix_truncv4sfv4si2 (tmp, operands[1]));
> > +emit_insn (gen_truncv4siv4hi2 (operands[0], tmp));
> > +DONE;
> > +  }
> > +)
> > +
> >  (define_expand "ftrunc2"
> >[(set (match_operand:VHSDF 0 "register_operand")
> > (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")] diff
> > --git a/gcc/testsuite/gcc.target/aarch64/fix_trunc2.c
> > b/gcc/testsuite/gcc.target/aarch64/fix_trunc2.c
> > new file mode 100644
> > index 000..57cc00913a3
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/fix_trunc2.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +void
> > +f (short *__restrict a, float *__restrict b) {
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {fcvtzs\tv[0-9]+.4s, v[0-9]+.4s}
> > +1 } } */
> > +/* { dg-final { scan-assembler-times {xtn\tv[0-9]+.4h, v[0-9]+.4s} 1
> > +} } */


RE: [PATCH v2] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-13 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > This patch improves GCC’s vectorization of __builtin_popcount for
> > aarch64 target by adding popcount patterns for vector modes besides
> > QImode, i.e., HImode, SImode and DImode.
> >
> > With this patch, we now generate the following for V8HI:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >
> > For V4HI, we generate:
> >   cnt v1.8b, v.8b
> >   uaddlp  v2.4h, v1.8b
> >
> > For V4SI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >
> > For V2SI, we generate:
> >   cnt v1.8b, v.8b
> >   uaddlp  v2.4h, v1.8b
> >   uaddlp  v3.2s, v2.4h
> >
> > For V2DI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >   uaddlp  v4.2d, v3.4s
> >
> > PR target/113859
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (aarch64_addlp):
> Rename to...
> > (@aarch64_addlp): ... This.
> > (popcount2): New define_expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/popcnt-vec.c: New test.
> > ---
> >  gcc/config/aarch64/aarch64-simd.md| 28 +++-
> >  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 65
> > +++
> >  2 files changed, 92 insertions(+), 1 deletion(-)  create mode 100644
> > gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 0bb39091a38..38dba285f69 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3461,7 +3461,7 @@ (define_insn
> "*aarch64_addlv_ze"
> >[(set_attr "type" "neon_reduc_add")]
> >  )
> >
> > -(define_expand "aarch64_addlp"
> > +(define_expand "@aarch64_addlp"
> >[(set (match_operand: 0 "register_operand")
> > (plus:
> >   (vec_select:
> > @@ -3517,6 +3517,32 @@ (define_insn
> "popcount2"
> >[(set_attr "type" "neon_cnt")]
> >  )
> >
> > +(define_expand "popcount2"
> > +  [(set (match_operand:VDQHSD 0 "register_operand")
> > +(popcount:VDQHSD (match_operand:VDQHSD 1
> > +"register_operand")))]
> > +  "TARGET_SIMD"
> > +  {
> > +/* Generate a byte popcount. */
> > +machine_mode mode =  == 64 ? V8QImode : V16QImode;
> > +rtx tmp = gen_reg_rtx (mode);
> > +auto icode = optab_handler (popcount_optab, mode);
> > +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode,
> > +operands[1])));
> > +
> > +/* Use a sequence of UADDLPs to accumulate the counts. Each step
> doubles the
> > +   element size and halves the number of elements. */
> 
> Nit: reflowing this paragraph has made the first line too long.
> I think we should stick with the version in the review:
> 
>/* Use a sequence of UADDLPs to accumulate the counts.  Each step
> doubles
>   the element size and halves the number of elements.  */

Good catch. I've fixed this in the latest version.
https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654601.html

> 
> > +do
> > +  {
> > +auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE
> (tmp));
> > +mode = insn_data[icode].operand[0].mode;
> > +rtx dest = mode == mode ? operands[0] : gen_reg_rtx
> (mode);
> > +emit_insn (GEN_FCN (icode) (dest, tmp));
> > +tmp = dest;
> > +  }
> > +while (mode != mode);
> > +DONE;
> > +  }
> > +)
> > +
> >  ;; 'across lanes' max and min ops.
> >
> >  ;; Template for outputting a scalar, so we can create __builtins
> > which can be diff --git
> > a/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> > b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> > new file mode 100644
> > index 000..89860940296
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> > @@ -0,0 +1,65 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* This function should produce cnt v.16b. */ void bar (unsigned char
> > +*__restrict b, unsigned char *__restrict d) {
> > +  for (int i = 0; i < 1024; i++)
> > +d[i] = __builtin_popcount (b[i]); }
> > +
> > +/* This function should produce cnt v

[PATCH v3] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-13 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s
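
For readers unfamiliar with UADDLP, a scalar model of one step (my sketch, not
part of the patch): it adds adjacent lanes and widens the result, so chaining
it turns 16 byte counts into 8 halfword, then 4 word, then 2 doubleword counts.

#include <stdint.h>

/* Models uaddlp v2.8h, v1.16b.  */
static void
uaddlp_16b_to_8h (uint16_t out[8], const uint8_t in[16])
{
  for (int i = 0; i < 8; i++)
    out[i] = (uint16_t) in[2 * i] + in[2 * i + 1];
}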

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 28 +++-
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 +++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 0bb39091a38..ee73e13534b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,32 @@ (define_insn "popcount2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    /* Generate a byte popcount. */
+    machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
+    rtx tmp = gen_reg_rtx (mode);
+    auto icode = optab_handler (popcount_optab, mode);
+    emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+    /* Use a sequence of UADDLPs to accumulate the counts. Each step doubles
+       the element size and halves the number of elements. */
+    do
+      {
+	auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+	mode = insn_data[icode].operand[0].mode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	emit_insn (GEN_FCN (icode) (dest, tmp));
+	tmp = dest;
+      }
+    while (mode != <MODE>mode);
+    DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
new file mode 100644
index 000..0c4926d7ca8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-vect-cost-model" } */
+
+/* This function should produce cnt v.16b. */
+void
+bar (unsigned char *__restrict b, unsigned char *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and uaddlp (Add Long Pairwise). */
+void
+bar1 (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and 2 uaddlp (Add Long Pairwise). */
+void
+bar2 (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and 3 uaddlp (Add Long Pairwise). */
+void
+bar3 (unsigned long long *__restrict b, unsigned long long *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcountll (b[i]);
+}
+
+/* SLP
+   This function should produce cnt v.8b and uaddlp (Add Long Pairwise). */
+void
+bar4 (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2] = __builtin_popcount (b[2]);
+  d[3] = __builtin_popcount (b[3]);
+}
+
+/* SLP
+   This function should produce cnt v.8b and 2 uaddlp (Add Long Pairwise). */
+void
+bar5 (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+}
+
+/* SLP
+   This function should produce cnt v.16b and 3 uaddlp (Add Long Pairwise). */
+void
+bar6 (unsigned long long *__restrict b, unsigned long long *__restrict d)
+{
+  d[0] = __builtin_popcountll (b[0]);
+  d[1] = __builtin_popcountll (b[1]);
+}
+
+/* 

RE: [PATCH] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-12 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > This patch improves GCC’s vectorization of __builtin_popcount for
> > aarch64 target by adding popcount patterns for vector modes besides
> > QImode, i.e., HImode, SImode and DImode.
> >
> > With this patch, we now generate the following for HImode:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >
> > For SImode, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >
> > For V2DI, we generate:
> >   cnt v1.16b, v.16b
> >   uaddlp  v2.8h, v1.16b
> >   uaddlp  v3.4s, v2.8h
> >   uaddlp  v4.2d, v3.4s
> >
> > gcc/ChangeLog:
> >
> > PR target/113859
> > * config/aarch64/aarch64-simd.md (popcount2): New
> define_expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/113859
> > * gcc.target/aarch64/popcnt-vec.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md| 40 
> >  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 48
> > +++
> >  2 files changed, 88 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index f8bb973a278..093c32ee8ff 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3540,6 +3540,46 @@ (define_insn
> "popcount2"
> >[(set_attr "type" "neon_cnt")]
> >  )
> >
> > +(define_expand "popcount2"
> > +  [(set (match_operand:VQN 0 "register_operand" "=w")
> > +(popcount:VQN (match_operand:VQN 1 "register_operand" "w")))]
> > +  "TARGET_SIMD"
> > +  {
> > +rtx v = gen_reg_rtx (V16QImode);
> > +rtx v1 = gen_reg_rtx (V16QImode);
> > +emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
> > +emit_insn (gen_popcountv16qi2 (v1, v));
> > +if (mode == V8HImode)
> > +  {
> > +/* For V8HI, we generate:
> > +cnt v1.16b, v.16b
> > +uaddlp  v2.8h, v1.16b */
> > +emit_insn (gen_aarch64_uaddlpv16qi (operands[0], v1));
> > +DONE;
> > +  }
> > +rtx v2 = gen_reg_rtx (V8HImode);
> > +emit_insn (gen_aarch64_uaddlpv16qi (v2, v1));
> > +if (mode == V4SImode)
> > +  {
> > +/* For V4SI, we generate:
> > +cnt v1.16b, v.16b
> > +uaddlp  v2.8h, v1.16b
> > +uaddlp  v3.4s, v2.8h */
> > +emit_insn (gen_aarch64_uaddlpv8hi (operands[0], v2));
> > +DONE;
> > +  }
> > +/* For V2DI, we generate:
> > +cnt v1.16b, v.16b
> > +uaddlp  v2.8h, v1.16b
> > +uaddlp  v3.4s, v2.8h
> > +uaddlp  v4.2d, v3.4s */
> > +rtx v3 = gen_reg_rtx (V4SImode);
> > +emit_insn (gen_aarch64_uaddlpv8hi (v3, v2));
> > +emit_insn (gen_aarch64_uaddlpv4si (operands[0], v3));
> > +DONE;
> > +  }
> > +)
> > +
> 
> Could you add support for V4HI and V2SI at the same time?

Yes, Richard, and thanks a lot for the example consolidating the handling of 
all 5 modes.

Here's the updated patch along with added tests covering V4HI and V2SI.
https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654429.html

Thanks,
Pengxuan
> 
> I think it's possible to handle all 5 modes iteratively, like so:
> 
> (define_expand "popcount<mode>2"
>   [(set (match_operand:VDQHSD 0 "register_operand")
> 	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
>   "TARGET_SIMD"
> {
>   /* Generate a byte popcount.  */
>   machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
>   rtx tmp = gen_reg_rtx (mode);
>   auto icode = optab_handler (popcount_optab, mode);
>   emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
> 
>   /* Use a sequence of UADDLPs to accumulate the counts.  Each step doubles
>      the element size and halves the number of elements.  */
>   do
>     {
>       auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
>       mode = insn_data[icode].operand[0].mode;
>       rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
>       emit_insn (GEN_FCN (icode) (dest, tmp));
>       tmp = dest;
>     }
>   while (mode != <MODE>mode);
>   DONE;
> })
> 
> (only lightly tested).  This requires chan

[PATCH v2] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-12 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for V8HI:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b

For V4HI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b

For V4SI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V2SI, we generate:
  cnt v1.8b, v.8b
  uaddlp  v2.4h, v1.8b
  uaddlp  v3.2s, v2.4h

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s

PR target/113859

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename to...
(@aarch64_<su>addlp<mode>): ... This.
(popcount<mode>2): New define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-vec.c: New test.
---
 gcc/config/aarch64/aarch64-simd.md| 28 +++-
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 65 +++
 2 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 0bb39091a38..38dba285f69 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@ (define_insn 
"*aarch64_addlv_ze"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand: 0 "register_operand")
(plus:
  (vec_select:
@@ -3517,6 +3517,32 @@ (define_insn "popcount2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    /* Generate a byte popcount. */
+    machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
+    rtx tmp = gen_reg_rtx (mode);
+    auto icode = optab_handler (popcount_optab, mode);
+    emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+    /* Use a sequence of UADDLPs to accumulate the counts. Each step doubles the
+       element size and halves the number of elements. */
+    do
+      {
+	auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+	mode = insn_data[icode].operand[0].mode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	emit_insn (GEN_FCN (icode) (dest, tmp));
+	tmp = dest;
+      }
+    while (mode != <MODE>mode);
+    DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
new file mode 100644
index 000..89860940296
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* This function should produce cnt v.16b. */
+void
+bar (unsigned char *__restrict b, unsigned char *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and uaddlp (Add Long Pairwise). */
+void
+bar1 (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and 2 uaddlp (Add Long Pairwise). */
+void
+bar2 (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and 3 uaddlp (Add Long Pairwise). */
+void
+bar3 (unsigned long long *__restrict b, unsigned long long *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcountll (b[i]);
+}
+
+/* This function should produce cnt v.8b and uaddlp (Add Long Pairwise). */
+void
+bar4 (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  for (int i = 0; i < 1028; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.8b and 2 uaddlp (Add Long Pairwise). */
+void
+bar5 (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  for (int i = 0; i < 1026; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* SLP
+ This function should produce cnt v.16b and 3 uaddlp (Add Long Pairwise). */
+void
+bar6 (unsigned long long *__restrict b, unsigned long long *__restrict d)
+{
+  d[0] = __builtin_popcountll (b[0]);
+  d[1] = __builtin_popcountll (b[1]);
+}
+
+/* { dg-final { scan-assembler-not {\tbl\tpopcount} } } */
+/* { dg-final { scan-assembler-times {cnt\t} 7 } } */
+/* { dg-final { scan-assembler-times {uaddlp\t} 12 } } */
+/* { dg-final { scan-assembler-times {ldr\tq} 5 } } */
+/* { dg-final { scan-assembler-times {ldr\td} 2 } } */
-- 
2.17.1



Ping [PATCH] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-11 Thread Pengxuan Zheng (QUIC)
Ping https://gcc.gnu.org/pipermail/gcc-patches/2024-May/650311.html

> -Original Message-
> From: Pengxuan Zheng (QUIC) 
> Sent: Tuesday, April 30, 2024 5:32 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Andrew Pinski (QUIC) ; Pengxuan Zheng
> (QUIC) 
> Subject: [PATCH] aarch64: Add vector popcount besides QImode [PR113859]
> 
> This patch improves GCC’s vectorization of __builtin_popcount for aarch64
> target by adding popcount patterns for vector modes besides QImode, i.e.,
> HImode, SImode and DImode.
> 
> With this patch, we now generate the following for HImode:
>   cnt v1.16b, v.16b
>   uaddlp  v2.8h, v1.16b
> 
> For SImode, we generate:
>   cnt v1.16b, v.16b
>   uaddlp  v2.8h, v1.16b
>   uaddlp  v3.4s, v2.8h
> 
> For V2DI, we generate:
>   cnt v1.16b, v.16b
>   uaddlp  v2.8h, v1.16b
>   uaddlp  v3.4s, v2.8h
>   uaddlp  v4.2d, v3.4s
> 
> gcc/ChangeLog:
> 
>   PR target/113859
>   * config/aarch64/aarch64-simd.md (popcount2): New
> define_expand.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR target/113859
>   * gcc.target/aarch64/popcnt-vec.c: New test.
> 
> Signed-off-by: Pengxuan Zheng 
> ---
>  gcc/config/aarch64/aarch64-simd.md| 40 
>  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 48 +++
>  2 files changed, 88 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index f8bb973a278..093c32ee8ff 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3540,6 +3540,46 @@ (define_insn "popcount2"
>[(set_attr "type" "neon_cnt")]
>  )
> 
> +(define_expand "popcount2"
> +  [(set (match_operand:VQN 0 "register_operand" "=w")
> +(popcount:VQN (match_operand:VQN 1 "register_operand" "w")))]
> +  "TARGET_SIMD"
> +  {
> +rtx v = gen_reg_rtx (V16QImode);
> +rtx v1 = gen_reg_rtx (V16QImode);
> +emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
> +emit_insn (gen_popcountv16qi2 (v1, v));
> +if (mode == V8HImode)
> +  {
> +/* For V8HI, we generate:
> +cnt v1.16b, v.16b
> +uaddlp  v2.8h, v1.16b */
> +emit_insn (gen_aarch64_uaddlpv16qi (operands[0], v1));
> +DONE;
> +  }
> +rtx v2 = gen_reg_rtx (V8HImode);
> +emit_insn (gen_aarch64_uaddlpv16qi (v2, v1));
> +if (mode == V4SImode)
> +  {
> +/* For V4SI, we generate:
> +cnt v1.16b, v.16b
> +uaddlp  v2.8h, v1.16b
> +uaddlp  v3.4s, v2.8h */
> +emit_insn (gen_aarch64_uaddlpv8hi (operands[0], v2));
> +DONE;
> +  }
> +/* For V2DI, we generate:
> +cnt v1.16b, v.16b
> +uaddlp  v2.8h, v1.16b
> +uaddlp  v3.4s, v2.8h
> +uaddlp  v4.2d, v3.4s */
> +rtx v3 = gen_reg_rtx (V4SImode);
> +emit_insn (gen_aarch64_uaddlpv8hi (v3, v2));
> +emit_insn (gen_aarch64_uaddlpv4si (operands[0], v3));
> +DONE;
> +  }
> +)
> +
>  ;; 'across lanes' max and min ops.
> 
>  ;; Template for outputting a scalar, so we can create __builtins which can be
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> new file mode 100644
> index 000..4c9a1b95990
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> @@ -0,0 +1,48 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* This function should produce cnt v.16b. */ void bar (unsigned char
> +*__restrict b, unsigned char *__restrict d) {
> +  for (int i = 0; i < 1024; i++)
> +d[i] = __builtin_popcount (b[i]);
> +}
> +
> +/* This function should produce cnt v.16b and uaddlp (Add Long
> +Pairwise). */ void
> +bar1 (unsigned short *__restrict b, unsigned short *__restrict d) {
> +  for (int i = 0; i < 1024; i++)
> +d[i] = __builtin_popcount (b[i]);
> +}
> +
> +/* This function should produce cnt v.16b and 2 uaddlp (Add Long
> +Pairwise). */ void
> +bar2 (unsigned int *__restrict b, unsigned int *__restrict d) {
> +  for (int i = 0; i < 1024; i++)
> +d[i] = __builtin_popcount (b[i]);
> +}
> +
> +/* This function should produce cnt v.16b and 3 uaddlp (Add Long
> +Pairwise). */ void
> +bar3 (unsigned long long *__restrict b, unsigned long long *__restrict
> +d) {
> +  for (int i = 0; i < 1024; i++)
> +d[i] = __builtin_popcountll (b[i]); }
> +
> +/* SLP
> + This function should produce cnt v.16b and 3 uaddlp (Add Long
> +Pairwise). */ void
> +bar4 (unsigned long long *__restrict b, unsigned long long *__restrict
> +d) {
> +  d[0] = __builtin_popcountll (b[0]);
> +  d[1] = __builtin_popcountll (b[1]);
> +}
> +
> +/* { dg-final { scan-assembler-not {\tbl\tpopcount} } } */
> +/* { dg-final { scan-assembler-times {cnt\t} 5 } } */
> +/* { dg-final { scan-assembler-times {uaddlp\t} 9 } } */
> +/* { dg-final { scan-assembler-times {ldr\tq} 5 } } */
> --
> 2.17.1



RE: [PATCH] aarch64: Add vector floating point trunc pattern

2024-06-11 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > This patch is a follow-up of r15-1079-g230d62a2cdd16c to add vector
> > floating point trunc pattern for V2DF->V2SF and V4SF->V4HF conversions
> > by renaming the existing
> > aarch64_float_truncate_lo_ pattern to the standard
> > optab one, i.e., trunc2. This allows the
> vectorizer to vectorize certain floating point narrowing operations for the
> aarch64 target.
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-builtins.cc (VAR1): Remap
> float_truncate_lo_
> > builtin codes to standard optab ones.
> > * config/aarch64/aarch64-simd.md
> (aarch64_float_truncate_lo_):
> > Rename to...
> > (trunc2): ... This.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/trunc-vec.c: New test.
> 
> OK, thanks.
> 
> Richard

Thanks, Richard! Pushed as r15-1182-ge7cd8ea1fa3e48.

Thanks,
Pengxuan
> 
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-builtins.cc   |  7 +++
> >  gcc/config/aarch64/aarch64-simd.md   |  6 +++---
> >  gcc/testsuite/gcc.target/aarch64/trunc-vec.c | 21
> > 
> >  3 files changed, 31 insertions(+), 3 deletions(-)  create mode 100644
> > gcc/testsuite/gcc.target/aarch64/trunc-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-builtins.cc
> > b/gcc/config/aarch64/aarch64-builtins.cc
> > index 25189888d17..d589e59defc 100644
> > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > @@ -543,6 +543,13 @@ BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
> >  VAR1 (float_extend_lo_, extend, v2sf, v2df)
> >  VAR1 (float_extend_lo_, extend, v4hf, v4sf)
> >
> > +/* __builtin_aarch64_float_truncate_lo_ should be expanded
> through the
> > +   standard optabs CODE_FOR_trunc2. */ constexpr
> > +insn_code CODE_FOR_aarch64_float_truncate_lo_v4hf
> > += CODE_FOR_truncv4sfv4hf2;
> > +constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v2sf
> > += CODE_FOR_truncv2dfv2sf2;
> > +
> >  #undef VAR1
> >  #define VAR1(T, N, MAP, FLAG, A) \
> >{#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG}, diff
> > --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index c5e2c9f00d0..f644bd1731e 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3197,7 +3197,7 @@ (define_expand
> "aarch64_float_trunc_rodd_hi_v4sf"
> >  }
> >  )
> >
> > -(define_insn "aarch64_float_truncate_lo_"
> > +(define_insn "trunc2"
> >[(set (match_operand:VDF 0 "register_operand" "=w")
> >(float_truncate:VDF
> > (match_operand: 1 "register_operand" "w")))] @@ -3256,7
> > +3256,7 @@ (define_expand "vec_pack_trunc_v2df"
> >  int lo = BYTES_BIG_ENDIAN ? 2 : 1;
> >  int hi = BYTES_BIG_ENDIAN ? 1 : 2;
> >
> > -emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[lo]));
> > +emit_insn (gen_truncv2dfv2sf2 (tmp, operands[lo]));
> >  emit_insn (gen_aarch64_float_truncate_hi_v4sf (operands[0],
> >tmp, operands[hi]));
> >  DONE;
> > @@ -3272,7 +3272,7 @@ (define_expand "vec_pack_trunc_df"
> >{
> >  rtx tmp = gen_reg_rtx (V2SFmode);
> >  emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2]));
> > -emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp));
> > +emit_insn (gen_truncv2dfv2sf2 (operands[0], tmp));
> >  DONE;
> >}
> >  )
> > diff --git a/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
> > b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
> > new file mode 100644
> > index 000..05e8af7912d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.2s, v[0-9]+.2d}
> > +1 } } */ void f (double *__restrict a, float *__restrict b) {
> > +  b[0] = a[0];
> > +  b[1] = a[1];
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.4h, v[0-9]+.4s}
> > +1 } } */ void
> > +f1 (float *__restrict a, _Float16 *__restrict b) {
> > +
> > +  b[0] = a[0];
> > +  b[1] = a[1];
> > +  b[2] = a[2];
> > +  b[3] = a[3];
> > +}


[PATCH] aarch64: Add vector floating point trunc pattern

2024-06-07 Thread Pengxuan Zheng
This patch is a follow-up of r15-1079-g230d62a2cdd16c to add vector floating
point trunc pattern for V2DF->V2SF and V4SF->V4HF conversions by renaming the
existing aarch64_float_truncate_lo_ pattern to the standard
optab one, i.e., trunc2. This allows the vectorizer
to vectorize certain floating point narrowing operations for the aarch64 target.

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (VAR1): Remap float_truncate_lo_
builtin codes to standard optab ones.
* config/aarch64/aarch64-simd.md 
(aarch64_float_truncate_lo_):
Rename to...
(trunc2): ... This.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/trunc-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-builtins.cc   |  7 +++
 gcc/config/aarch64/aarch64-simd.md   |  6 +++---
 gcc/testsuite/gcc.target/aarch64/trunc-vec.c | 21 
 3 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/trunc-vec.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 25189888d17..d589e59defc 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -543,6 +543,13 @@ BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
 VAR1 (float_extend_lo_, extend, v2sf, v2df)
 VAR1 (float_extend_lo_, extend, v4hf, v4sf)
 
+/* __builtin_aarch64_float_truncate_lo_ should be expanded through the
+   standard optabs CODE_FOR_trunc2. */
+constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v4hf
+= CODE_FOR_truncv4sfv4hf2;
+constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v2sf
+= CODE_FOR_truncv2dfv2sf2;
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index c5e2c9f00d0..f644bd1731e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3197,7 +3197,7 @@ (define_expand "aarch64_float_trunc_rodd_hi_v4sf"
 }
 )
 
-(define_insn "aarch64_float_truncate_lo_"
+(define_insn "trunc2"
   [(set (match_operand:VDF 0 "register_operand" "=w")
   (float_truncate:VDF
(match_operand: 1 "register_operand" "w")))]
@@ -3256,7 +3256,7 @@ (define_expand "vec_pack_trunc_v2df"
 int lo = BYTES_BIG_ENDIAN ? 2 : 1;
 int hi = BYTES_BIG_ENDIAN ? 1 : 2;
 
-emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[lo]));
+emit_insn (gen_truncv2dfv2sf2 (tmp, operands[lo]));
 emit_insn (gen_aarch64_float_truncate_hi_v4sf (operands[0],
   tmp, operands[hi]));
 DONE;
@@ -3272,7 +3272,7 @@ (define_expand "vec_pack_trunc_df"
   {
 rtx tmp = gen_reg_rtx (V2SFmode);
 emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2]));
-emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp));
+emit_insn (gen_truncv2dfv2sf2 (operands[0], tmp));
 DONE;
   }
 )
diff --git a/gcc/testsuite/gcc.target/aarch64/trunc-vec.c 
b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
new file mode 100644
index 000..05e8af7912d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.2s, v[0-9]+.2d} 1 } } */
+void
+f (double *__restrict a, float *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.4h, v[0-9]+.4s} 1 } } */
+void
+f1 (float *__restrict a, _Float16 *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}
-- 
2.17.1



RE: [PATCH v2] aarch64: Add vector floating point extend pattern [PR113880, PR113869]

2024-06-06 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > This patch adds vector floating point extend pattern for V2SF->V2DF
> > and
> > V4HF->V4SF conversions by renaming the existing
> > V4HF->aarch64_float_extend_lo_
> > pattern to the standard optab one, i.e., extend2. This
> > allows the vectorizer to vectorize certain floating point widening
> > operations for the
> > aarch64 target.
> >
> > PR target/113880
> > PR target/113869
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-builtins.cc (VAR1): Remap float_extend_lo_
> > builtin codes to standard optab ones.
> > * config/aarch64/aarch64-simd.md
> (aarch64_float_extend_lo_): Rename
> > to...
> > (extend2): ... This.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/extend-vec.c: New test.
> 
> OK, thanks, and sorry for the slow review.
> 
> Richard

Thanks, Richard! Pushed as r15-1079-g230d62a2cdd16c.

Thanks,
Pengxuan
> 
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-builtins.cc|  9 
> >  gcc/config/aarch64/aarch64-simd.md|  2 +-
> >  gcc/testsuite/gcc.target/aarch64/extend-vec.c | 21
> > +++
> >  3 files changed, 31 insertions(+), 1 deletion(-)  create mode 100644
> > gcc/testsuite/gcc.target/aarch64/extend-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-builtins.cc
> > b/gcc/config/aarch64/aarch64-builtins.cc
> > index f8eeccb554d..25189888d17 100644
> > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > @@ -534,6 +534,15 @@ BUILTIN_VDQ_BHSI (urhadd, uavg, _ceil, 0)
> > BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0)  BUILTIN_VDQ_BHSI (uhadd,
> > uavg, _floor, 0)
> >
> > +/* The builtins below should be expanded through the standard optabs
> > +   CODE_FOR_extend2. */
> > +#undef VAR1
> > +#define VAR1(F,T,N,M) \
> > +  constexpr insn_code CODE_FOR_aarch64_##F##M =
> > +CODE_FOR_##T##N##M##2;
> > +
> > +VAR1 (float_extend_lo_, extend, v2sf, v2df)
> > +VAR1 (float_extend_lo_, extend, v4hf, v4sf)
> > +
> >  #undef VAR1
> >  #define VAR1(T, N, MAP, FLAG, A) \
> >{#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG}, diff
> > --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 868f4486218..c5e2c9f00d0 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3132,7 +3132,7 @@
> >  DONE;
> >}
> >  )
> > -(define_insn "aarch64_float_extend_lo_"
> > +(define_insn "extend2"
> >[(set (match_operand: 0 "register_operand" "=w")
> > (float_extend:
> >   (match_operand:VDF 1 "register_operand" "w")))] diff --git
> > a/gcc/testsuite/gcc.target/aarch64/extend-vec.c
> > b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
> > new file mode 100644
> > index 000..f6241d5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.2d, v[0-9]+.2s}
> > +1 } } */ void f (float *__restrict a, double *__restrict b) {
> > +  b[0] = a[0];
> > +  b[1] = a[1];
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.4s, v[0-9]+.4h}
> > +1 } } */ void
> > +f1 (_Float16 *__restrict a, float *__restrict b) {
> > +
> > +  b[0] = a[0];
> > +  b[1] = a[1];
> > +  b[2] = a[2];
> > +  b[3] = a[3];
> > +}


[PATCH] aarch64: Add fix_truncv4sfv4hi2 pattern [PR113882]

2024-06-03 Thread Pengxuan Zheng
This patch adds the fix_truncv4sfv4hi2 (V4SF->V4HI) pattern which is implemented
using fix_truncv4sfv4si2 (V4SF->V4SI) and then truncv4siv4hi2 (V4SI->V4HI).
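
A quick justification (mine, not from the patch) for why the two-step lowering
is safe: both conversions truncate toward zero, so for any input whose
truncated value fits in short the two forms below agree, and inputs outside
that range are undefined for the direct conversion as well.

short direct (float f)   { return (short) f; }
short two_step (float f) { return (short) (int) f; }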

PR target/113882

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (fix_truncv4sfv4hi2): New pattern.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/fix_trunc2.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 13 +
 gcc/testsuite/gcc.target/aarch64/fix_trunc2.c | 14 ++
 2 files changed, 27 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fix_trunc2.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 868f4486218..096f7b56a27 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3032,6 +3032,19 @@ (define_expand 
"2"
   "TARGET_SIMD"
   {})
 
+
+(define_expand "fix_truncv4sfv4hi2"
+  [(match_operand:V4HI 0 "register_operand")
+   (match_operand:V4SF 1 "register_operand")]
+  "TARGET_SIMD"
+  {
+rtx tmp = gen_reg_rtx (V4SImode);
+emit_insn (gen_fix_truncv4sfv4si2 (tmp, operands[1]));
+emit_insn (gen_truncv4siv4hi2 (operands[0], tmp));
+DONE;
+  }
+)
+
 (define_expand "ftrunc2"
   [(set (match_operand:VHSDF 0 "register_operand")
(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")]
diff --git a/gcc/testsuite/gcc.target/aarch64/fix_trunc2.c 
b/gcc/testsuite/gcc.target/aarch64/fix_trunc2.c
new file mode 100644
index 000..57cc00913a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fix_trunc2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+void
+f (short *__restrict a, float *__restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+/* { dg-final { scan-assembler-times {fcvtzs\tv[0-9]+.4s, v[0-9]+.4s} 1 } } */
+/* { dg-final { scan-assembler-times {xtn\tv[0-9]+.4h, v[0-9]+.4s} 1 } } */
-- 
2.17.1



Ping [PATCH] aarch64: Add vector popcount besides QImode [PR113859]

2024-06-02 Thread Pengxuan Zheng (QUIC)
Ping

> -Original Message-
> From: Pengxuan Zheng (QUIC) 
> Sent: Tuesday, April 30, 2024 5:32 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Andrew Pinski (QUIC) ; Pengxuan Zheng
> (QUIC) 
> Subject: [PATCH] aarch64: Add vector popcount besides QImode [PR113859]
> 
> This patch improves GCC’s vectorization of __builtin_popcount for aarch64
> target by adding popcount patterns for vector modes besides QImode, i.e.,
> HImode, SImode and DImode.
> 
> With this patch, we now generate the following for HImode:
>   cnt v1.16b, v.16b
>   uaddlp  v2.8h, v1.16b
> 
> For SImode, we generate:
>   cnt v1.16b, v.16b
>   uaddlp  v2.8h, v1.16b
>   uaddlp  v3.4s, v2.8h
> 
> For V2DI, we generate:
>   cnt v1.16b, v.16b
>   uaddlp  v2.8h, v1.16b
>   uaddlp  v3.4s, v2.8h
>   uaddlp  v4.2d, v3.4s
> 
> gcc/ChangeLog:
> 
>   PR target/113859
>   * config/aarch64/aarch64-simd.md (popcount2): New
> define_expand.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR target/113859
>   * gcc.target/aarch64/popcnt-vec.c: New test.
> 
> Signed-off-by: Pengxuan Zheng 
> ---
>  gcc/config/aarch64/aarch64-simd.md| 40 
>  gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 48 +++
>  2 files changed, 88 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index f8bb973a278..093c32ee8ff 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3540,6 +3540,46 @@ (define_insn "popcount2"
>[(set_attr "type" "neon_cnt")]
>  )
> 
> +(define_expand "popcount2"
> +  [(set (match_operand:VQN 0 "register_operand" "=w")
> +(popcount:VQN (match_operand:VQN 1 "register_operand" "w")))]
> +  "TARGET_SIMD"
> +  {
> +rtx v = gen_reg_rtx (V16QImode);
> +rtx v1 = gen_reg_rtx (V16QImode);
> +emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
> +emit_insn (gen_popcountv16qi2 (v1, v));
> +if (mode == V8HImode)
> +  {
> +/* For V8HI, we generate:
> +cnt v1.16b, v.16b
> +uaddlp  v2.8h, v1.16b */
> +emit_insn (gen_aarch64_uaddlpv16qi (operands[0], v1));
> +DONE;
> +  }
> +rtx v2 = gen_reg_rtx (V8HImode);
> +emit_insn (gen_aarch64_uaddlpv16qi (v2, v1));
> +if (mode == V4SImode)
> +  {
> +/* For V4SI, we generate:
> +cnt v1.16b, v.16b
> +uaddlp  v2.8h, v1.16b
> +uaddlp  v3.4s, v2.8h */
> +emit_insn (gen_aarch64_uaddlpv8hi (operands[0], v2));
> +DONE;
> +  }
> +/* For V2DI, we generate:
> +cnt v1.16b, v.16b
> +uaddlp  v2.8h, v1.16b
> +uaddlp  v3.4s, v2.8h
> +uaddlp  v4.2d, v3.4s */
> +rtx v3 = gen_reg_rtx (V4SImode);
> +emit_insn (gen_aarch64_uaddlpv8hi (v3, v2));
> +emit_insn (gen_aarch64_uaddlpv4si (operands[0], v3));
> +DONE;
> +  }
> +)
> +
>  ;; 'across lanes' max and min ops.
> 
>  ;; Template for outputting a scalar, so we can create __builtins which can be
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> new file mode 100644
> index 000..4c9a1b95990
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
> @@ -0,0 +1,48 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* This function should produce cnt v.16b. */ void bar (unsigned char
> +*__restrict b, unsigned char *__restrict d) {
> +  for (int i = 0; i < 1024; i++)
> +d[i] = __builtin_popcount (b[i]);
> +}
> +
> +/* This function should produce cnt v.16b and uaddlp (Add Long
> +Pairwise). */ void
> +bar1 (unsigned short *__restrict b, unsigned short *__restrict d) {
> +  for (int i = 0; i < 1024; i++)
> +d[i] = __builtin_popcount (b[i]);
> +}
> +
> +/* This function should produce cnt v.16b and 2 uaddlp (Add Long
> +Pairwise). */ void
> +bar2 (unsigned int *__restrict b, unsigned int *__restrict d) {
> +  for (int i = 0; i < 1024; i++)
> +d[i] = __builtin_popcount (b[i]);
> +}
> +
> +/* This function should produce cnt v.16b and 3 uaddlp (Add Long
> +Pairwise). */ void
> +bar3 (unsigned long long *__restrict b, unsigned long long *__restrict
> +d) {
> +  for (int i = 0; i < 1024; i++)
> +d[i] = __builtin_popcountll (b[i]); }
> +
> +/* SLP
> + This function should produce cnt v.16b and 3 uaddlp (Add Long
> +Pairwise). */ void
> +bar4 (unsigned long long *__restrict b, unsigned long long *__restrict
> +d) {
> +  d[0] = __builtin_popcountll (b[0]);
> +  d[1] = __builtin_popcountll (b[1]);
> +}
> +
> +/* { dg-final { scan-assembler-not {\tbl\tpopcount} } } */
> +/* { dg-final { scan-assembler-times {cnt\t} 5 } } */
> +/* { dg-final { scan-assembler-times {uaddlp\t} 9 } } */
> +/* { dg-final { scan-assembler-times {ldr\tq} 5 } } */
> --
> 2.17.1



RE: [PATCH] aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

2024-05-31 Thread Pengxuan Zheng (QUIC)
> > Pengxuan Zheng  writes:
> > > vget_low_2.c is a test case for little-endian, but we missed the
> > > -mlittle-endian flag in r15-697-ga2e4fe5a53cf75.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >   * gcc.target/aarch64/vget_low_2.c: Add -mlittle-endian.
> >
> > Ok, thanks.
> >
> > If you'd like write access, please follow the instructions on
> > https://gcc.gnu.org/gitwrite.html (I'll sponsor).
> >
> > Richard
> 
> Thanks a lot, Richard! I really appreciate it!
> 
> I have submitted a request for write access naming you as sponsor.
> 
> Thanks,
> Pengxuan

Thanks, Richard! I've been granted write access now and committed
the patch as r15-950-g7fb62627cfb3e0.
> >
> > > Signed-off-by: Pengxuan Zheng 
> > > ---
> > >  gcc/testsuite/gcc.target/aarch64/vget_low_2.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > index 44414e1c043..93e9e664ee9 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > @@ -1,5 +1,5 @@
> > >  /* { dg-do compile } */
> > > -/* { dg-options "-O3 -fdump-tree-optimized" } */
> > > +/* { dg-options "-O3 -fdump-tree-optimized -mlittle-endian" } */
> > >
> > >  #include 


MAINTAINERS: Add myself to Write After Approval and DCO

2024-05-31 Thread Pengxuan Zheng
ChangeLog:

* MAINTAINERS: Add myself to Write After Approval and DCO.

Signed-off-by: Pengxuan Zheng 
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e2870eef2ef..6444e6ea2f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -743,6 +743,7 @@ Dennis Zhang

 Yufeng Zhang   
 Qing Zhao  
 Shujing Zhao   
+Pengxuan Zheng 
 Jon Ziegler
 Roman Zippel   
 Josef Zlomek   
@@ -789,3 +790,4 @@ Martin Uecker   

 Jonathan Wakely
 Alexander Westbrooks   
 Chung-Ju Wu
+Pengxuan Zheng 
-- 
2.17.1



RE: [PATCH] aarch64: Add vector floating point extend patterns [PR113880, PR113869]

2024-05-30 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > This patch improves vectorization of certain floating point widening
> > operations for the aarch64 target by adding vector floating point
> > extend patterns for
> > V2SF->V2DF and V4HF->V4SF conversions.
> >
> > PR target/113880
> > PR target/113869
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (extend2): New
> expand.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/extend-vec.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng 
> 
> Thanks for doing this.  Could we instead rename
> aarch64_float_extend_lo_ to extend2 and use
> something similar to:
> 
> ---
> /* The builtins below should be expanded through the standard optabs
>CODE_FOR_[u]avg3_[floor,ceil].  However the mapping scheme in
>aarch64-simd-builtins.def does not easily allow us to have a pre-mode
>("uavg") and post-mode string ("_ceil") in the CODE_FOR_* construction.
>So the builtins use a name that is natural for AArch64 instructions
>e.g. "aarch64_srhadd" and we re-map these to the optab-related
>CODE_FOR_ here.  */
> #undef VAR1
> #define VAR1(F,T1,T2,I,M) \
> constexpr insn_code CODE_FOR_aarch64_##F##M =
> CODE_FOR_##T1##M##3##T2;
> 
> BUILTIN_VDQ_BHSI (srhadd, avg, _ceil, 0) BUILTIN_VDQ_BHSI (urhadd, uavg,
> _ceil, 0) BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0) BUILTIN_VDQ_BHSI
> (uhadd, uavg, _floor, 0)
> 
> #undef VAR1
> ---
> 
> (from aarch64-builtins.cc) to handle the intrinsics?  The idea is to try to 
> avoid
> adding new patterns just to satisfy the internal naming convention.

Sure, Richard.

Here's the updated patch 
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653177.html.

Please let me know if I missed anything.

Thanks,
Pengxuan
> 
> Richard
> 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md|  7 +++
> >  gcc/testsuite/gcc.target/aarch64/extend-vec.c | 21
> > +++
> >  2 files changed, 28 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/extend-vec.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index 868f4486218..8febb411d06 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3141,6 +3141,13 @@ (define_insn
> "aarch64_float_extend_lo_"
> >[(set_attr "type" "neon_fp_cvt_widen_s")]
> >  )
> >
> > +(define_expand "extend2"
> > +  [(set (match_operand: 0 "register_operand" "=w")
> > +(float_extend:
> > +  (match_operand:VDF 1 "register_operand" "w")))]
> > +  "TARGET_SIMD"
> > +)
> > +
> >  ;; Float narrowing operations.
> >
> >  (define_insn "aarch64_float_trunc_rodd_df"
> > diff --git a/gcc/testsuite/gcc.target/aarch64/extend-vec.c
> > b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
> > new file mode 100644
> > index 000..f6241d5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.2d, v[0-9]+.2s}
> > +1 } } */ void f (float *__restrict a, double *__restrict b) {
> > +  b[0] = a[0];
> > +  b[1] = a[1];
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.4s, v[0-9]+.4h}
> > +1 } } */ void
> > +f1 (_Float16 *__restrict a, float *__restrict b) {
> > +
> > +  b[0] = a[0];
> > +  b[1] = a[1];
> > +  b[2] = a[2];
> > +  b[3] = a[3];
> > +}


[PATCH v2] aarch64: Add vector floating point extend pattern [PR113880, PR113869]

2024-05-30 Thread Pengxuan Zheng
This patch adds a vector floating point extend pattern for V2SF->V2DF and
V4HF->V4SF conversions by renaming the existing aarch64_float_extend_lo_
pattern to the standard optab one, i.e., extend2. This allows the
vectorizer to vectorize certain floating point widening operations for the
aarch64 target.
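
As a usage sketch (my illustration; the intrinsic routing is an assumption
based on arm_neon.h), the existing vcvt_f64_f32 intrinsic keeps working
because its builtin code is remapped to the renamed standard pattern:

#include <arm_neon.h>

float64x2_t
widen (float32x2_t x)
{
  /* Expands via __builtin_aarch64_float_extend_lo_v2df, which the VAR1
     remap points at the standard extend pattern, so this should still
     emit fcvtl v0.2d, v0.2s.  */
  return vcvt_f64_f32 (x);
}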

PR target/113880
PR target/113869

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (VAR1): Remap float_extend_lo_
builtin codes to standard optab ones.
* config/aarch64/aarch64-simd.md (aarch64_float_extend_lo_): Rename
to...
(extend2): ... This.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/extend-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-builtins.cc|  9 
 gcc/config/aarch64/aarch64-simd.md|  2 +-
 gcc/testsuite/gcc.target/aarch64/extend-vec.c | 21 +++
 3 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/extend-vec.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index f8eeccb554d..25189888d17 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -534,6 +534,15 @@ BUILTIN_VDQ_BHSI (urhadd, uavg, _ceil, 0)
 BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0)
 BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
 
+/* The builtins below should be expanded through the standard optabs
+   CODE_FOR_extend2. */
+#undef VAR1
+#define VAR1(F,T,N,M) \
+  constexpr insn_code CODE_FOR_aarch64_##F##M = CODE_FOR_##T##N##M##2;
+
+VAR1 (float_extend_lo_, extend, v2sf, v2df)
+VAR1 (float_extend_lo_, extend, v4hf, v4sf)
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 868f4486218..c5e2c9f00d0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3132,7 +3132,7 @@
 DONE;
   }
 )
-(define_insn "aarch64_float_extend_lo_"
+(define_insn "extend2"
   [(set (match_operand: 0 "register_operand" "=w")
(float_extend:
  (match_operand:VDF 1 "register_operand" "w")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/extend-vec.c 
b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
new file mode 100644
index 000..f6241d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.2d, v[0-9]+.2s} 1 } } */
+void
+f (float *__restrict a, double *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.4s, v[0-9]+.4h} 1 } } */
+void
+f1 (_Float16 *__restrict a, float *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}
-- 
2.17.1



RE: [PATCH] aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

2024-05-30 Thread Pengxuan Zheng (QUIC)
> Pengxuan Zheng  writes:
> > vget_low_2.c is a test case for little-endian, but we missed the
> > -mlittle-endian flag in r15-697-ga2e4fe5a53cf75.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/vget_low_2.c: Add -mlittle-endian.
> 
> Ok, thanks.
> 
> If you'd like write access, please follow the instructions on
> https://gcc.gnu.org/gitwrite.html (I'll sponsor).
> 
> Richard

Thanks a lot, Richard! I really appreciate it!

I have submitted a request for write access naming you as sponsor.

Thanks,
Pengxuan
> 
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/testsuite/gcc.target/aarch64/vget_low_2.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > index 44414e1c043..93e9e664ee9 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O3 -fdump-tree-optimized" } */
> > +/* { dg-options "-O3 -fdump-tree-optimized -mlittle-endian" } */
> >
> >  #include 


[PATCH] aarch64: Add vector floating point extend patterns [PR113880, PR113869]

2024-05-29 Thread Pengxuan Zheng
This patch improves vectorization of certain floating point widening operations
for the aarch64 target by adding vector floating point extend patterns for
V2SF->V2DF and V4HF->V4SF conversions.

PR target/113880
PR target/113869

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (extend2): New expand.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/extend-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md|  7 +++
 gcc/testsuite/gcc.target/aarch64/extend-vec.c | 21 +++
 2 files changed, 28 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/extend-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 868f4486218..8febb411d06 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3141,6 +3141,13 @@ (define_insn "aarch64_float_extend_lo_"
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
+(define_expand "extend2"
+  [(set (match_operand: 0 "register_operand" "=w")
+(float_extend:
+  (match_operand:VDF 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+)
+
 ;; Float narrowing operations.
 
 (define_insn "aarch64_float_trunc_rodd_df"
diff --git a/gcc/testsuite/gcc.target/aarch64/extend-vec.c 
b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
new file mode 100644
index 000..f6241d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.2d, v[0-9]+.2s} 1 } } */
+void
+f (float *__restrict a, double *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.4s, v[0-9]+.4h} 1 } } */
+void
+f1 (_Float16 *__restrict a, float *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}
-- 
2.17.1



[PATCH] aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

2024-05-22 Thread Pengxuan Zheng
vget_low_2.c is a test case for little-endian, but we missed the -mlittle-endian
flag in r15-697-ga2e4fe5a53cf75.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vget_low_2.c: Add -mlittle-endian.

Signed-off-by: Pengxuan Zheng 
---
 gcc/testsuite/gcc.target/aarch64/vget_low_2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c 
b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
index 44414e1c043..93e9e664ee9 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-options "-O3 -fdump-tree-optimized -mlittle-endian" } */
 
 #include 
 
-- 
2.17.1



[PATCH] aarch64: Fold vget_high_* intrinsics to BIT_FIELD_REF [PR102171]

2024-05-21 Thread Pengxuan Zheng
This patch is a follow-up of r15-697-ga2e4fe5a53cf75 to also fold vget_high_*
intrinsics to BIT_FIELD_REF and remove the vget_high_* definitions from
arm_neon.h to use the new intrinsics framework.
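
A small sketch of what the fold buys us (my illustration, assuming
little-endian lane numbering; not from the patch):

#include <arm_neon.h>

int32x2_t
high_twice (int32x4_t x)
{
  /* After this patch, vget_high_s32 is folded to
     BIT_FIELD_REF <x, 64, 64> in gimple, so later passes can reason
     about the upper half directly instead of seeing an opaque builtin.  */
  int32x2_t hi = vget_high_s32 (x);
  return vadd_s32 (hi, hi);
}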

PR target/102171

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_HIGH_BUILTINS):
New macro to create definitions for all vget_high intrinsics.
(VGET_HIGH_BUILTIN): Likewise.
(enum aarch64_builtins): Add vget_high function codes.
(AARCH64_SIMD_VGET_LOW_BUILTINS): Delete duplicate macro.
(aarch64_general_fold_builtin): Fold vget_high calls.
* config/aarch64/aarch64-simd-builtins.def: Delete vget_high builtins.
* config/aarch64/aarch64-simd.md (aarch64_get_high): Delete.
(aarch64_vget_hi_halfv8bf): Likewise.
* config/aarch64/arm_neon.h (__attribute__): Delete.
(vget_high_f16): Likewise.
(vget_high_f32): Likewise.
(vget_high_f64): Likewise.
(vget_high_p8): Likewise.
(vget_high_p16): Likewise.
(vget_high_p64): Likewise.
(vget_high_s8): Likewise.
(vget_high_s16): Likewise.
(vget_high_s32): Likewise.
(vget_high_s64): Likewise.
(vget_high_u8): Likewise.
(vget_high_u16): Likewise.
(vget_high_u32): Likewise.
(vget_high_u64): Likewise.
(vget_high_bf16): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vget_high_2.c: New test.
* gcc.target/aarch64/vget_high_2_be.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-builtins.cc|  59 +++---
 gcc/config/aarch64/aarch64-simd-builtins.def  |   6 -
 gcc/config/aarch64/aarch64-simd.md|  22 
 gcc/config/aarch64/arm_neon.h | 105 --
 .../gcc.target/aarch64/vget_high_2.c  |  30 +
 .../gcc.target/aarch64/vget_high_2_be.c   |  31 ++
 6 files changed, 104 insertions(+), 149 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_high_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_high_2_be.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 11b888016ed..f8eeccb554d 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -675,6 +675,23 @@ static aarch64_simd_builtin_datum 
aarch64_simd_builtin_data[] = {
   VGET_LOW_BUILTIN(u64) \
   VGET_LOW_BUILTIN(bf16)
 
+#define AARCH64_SIMD_VGET_HIGH_BUILTINS \
+  VGET_HIGH_BUILTIN(f16) \
+  VGET_HIGH_BUILTIN(f32) \
+  VGET_HIGH_BUILTIN(f64) \
+  VGET_HIGH_BUILTIN(p8) \
+  VGET_HIGH_BUILTIN(p16) \
+  VGET_HIGH_BUILTIN(p64) \
+  VGET_HIGH_BUILTIN(s8) \
+  VGET_HIGH_BUILTIN(s16) \
+  VGET_HIGH_BUILTIN(s32) \
+  VGET_HIGH_BUILTIN(s64) \
+  VGET_HIGH_BUILTIN(u8) \
+  VGET_HIGH_BUILTIN(u16) \
+  VGET_HIGH_BUILTIN(u32) \
+  VGET_HIGH_BUILTIN(u64) \
+  VGET_HIGH_BUILTIN(bf16)
+
 typedef struct
 {
   const char *name;
@@ -717,6 +734,9 @@ typedef struct
 #define VGET_LOW_BUILTIN(A) \
   AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
 
+#define VGET_HIGH_BUILTIN(A) \
+  AARCH64_SIMD_BUILTIN_VGET_HIGH_##A,
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
@@ -753,6 +773,7 @@ enum aarch64_builtins
   /* SIMD intrinsic builtins.  */
   AARCH64_SIMD_VREINTERPRET_BUILTINS
   AARCH64_SIMD_VGET_LOW_BUILTINS
+  AARCH64_SIMD_VGET_HIGH_BUILTINS
   /* ARMv8.3-A Pointer Authentication Builtins.  */
   AARCH64_PAUTH_BUILTIN_AUTIA1716,
   AARCH64_PAUTH_BUILTIN_PACIA1716,
@@ -855,26 +876,21 @@ static aarch64_fcmla_laneq_builtin_datum 
aarch64_fcmla_lane_builtin_data[] = {
false \
   },
 
-#define AARCH64_SIMD_VGET_LOW_BUILTINS \
-  VGET_LOW_BUILTIN(f16) \
-  VGET_LOW_BUILTIN(f32) \
-  VGET_LOW_BUILTIN(f64) \
-  VGET_LOW_BUILTIN(p8) \
-  VGET_LOW_BUILTIN(p16) \
-  VGET_LOW_BUILTIN(p64) \
-  VGET_LOW_BUILTIN(s8) \
-  VGET_LOW_BUILTIN(s16) \
-  VGET_LOW_BUILTIN(s32) \
-  VGET_LOW_BUILTIN(s64) \
-  VGET_LOW_BUILTIN(u8) \
-  VGET_LOW_BUILTIN(u16) \
-  VGET_LOW_BUILTIN(u32) \
-  VGET_LOW_BUILTIN(u64) \
-  VGET_LOW_BUILTIN(bf16)
+#undef VGET_HIGH_BUILTIN
+#define VGET_HIGH_BUILTIN(A) \
+  {"vget_high_" #A, \
+   AARCH64_SIMD_BUILTIN_VGET_HIGH_##A, \
+   2, \
+   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
+   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
+   FLAG_AUTO_FP, \
+   false \
+  },
 
 static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
   AARCH64_SIMD_VREINTERPRET_BUILTINS
   AARCH64_SIMD_VGET_LOW_BUILTINS
+  AARCH64_SIMD_VGET_HIGH_BUILTINS
 };
 
 
@@ -3270,6 +3286,10 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, 
tree arg2)
 #define VGET_LOW_BUILTIN(A) \
   case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
 
+#undef VGET_HIGH_BUILTIN
+#define VGET_HIGH_BUILTIN(A) \
+  case AARCH64_SIMD_BUILTIN_VGET_HIGH_##A:
+
 /* Try to fold a call to the built-in function with subcode FCODE.  The
function is 

RE: [PATCH] aarch64: Fold vget_low_* intrinsics to BIT_FIELD_REF [PR102171]

2024-05-20 Thread Pengxuan Zheng (QUIC)
> On Mon, May 20, 2024 at 2:57 AM Richard Sandiford
>  wrote:
> >
> > Pengxuan Zheng  writes:
> > > This patch folds vget_low_* intrinsics to BIT_FILED_REF to open up
> > > more optimization opportunities for gimple optimizers.
> > >
> > > While we are here, we also remove the vget_low_* definitions from
> > > arm_neon.h and use the new intrinsics framework.
> > >
> > > PR target/102171
> > >
> > > gcc/ChangeLog:
> > >
> > >   * config/aarch64/aarch64-builtins.cc
> (AARCH64_SIMD_VGET_LOW_BUILTINS):
> > >   New macro to create definitions for all vget_low intrinsics.
> > >   (VGET_LOW_BUILTIN): Likewise.
> > >   (enum aarch64_builtins): Add vget_low function codes.
> > >   (aarch64_general_fold_builtin): Fold vget_low calls.
> > >   * config/aarch64/aarch64-simd-builtins.def: Delete vget_low 
> > > builtins.
> > >   * config/aarch64/aarch64-simd.md (aarch64_get_low): Delete.
> > >   (aarch64_vget_lo_halfv8bf): Likewise.
> > >   * config/aarch64/arm_neon.h (__attribute__): Delete.
> > >   (vget_low_f16): Likewise.
> > >   (vget_low_f32): Likewise.
> > >   (vget_low_f64): Likewise.
> > >   (vget_low_p8): Likewise.
> > >   (vget_low_p16): Likewise.
> > >   (vget_low_p64): Likewise.
> > >   (vget_low_s8): Likewise.
> > >   (vget_low_s16): Likewise.
> > >   (vget_low_s32): Likewise.
> > >   (vget_low_s64): Likewise.
> > >   (vget_low_u8): Likewise.
> > >   (vget_low_u16): Likewise.
> > >   (vget_low_u32): Likewise.
> > >   (vget_low_u64): Likewise.
> > >   (vget_low_bf16): Likewise.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >   * gcc.target/aarch64/pr113573.c: Replace
> __builtin_aarch64_get_lowv8hi
> > >   with vget_low_s16.
> > >   * gcc.target/aarch64/vget_low_2.c: New test.
> > >   * gcc.target/aarch64/vget_low_2_be.c: New test.
> >
> > Ok, thanks.  I suppose the patch has the side effect of allowing
> > vget_low_bf16 to be called without +bf16.  IMO that's the correct
> > behaviour though, and is consistent with how we handle reinterprets.

Thanks, Richard! Yes, it does have the side effect you pointed out and is 
consistent with how reinterprets are handled currently.
> 
> Pushed as r15-697-ga2e4fe5a53cf75cd055f64e745ebd51253e42254 .

Thanks, Andrew!
> 
> Thanks,
> Andrew
> 
> >
> > Richard
> >
> > > Signed-off-by: Pengxuan Zheng 
> > > ---
> > >  gcc/config/aarch64/aarch64-builtins.cc|  60 ++
> > >  gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
> > >  gcc/config/aarch64/aarch64-simd.md|  23 +---
> > >  gcc/config/aarch64/arm_neon.h | 105 --
> > >  gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
> > >  gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +
> > >  .../gcc.target/aarch64/vget_low_2_be.c|  31 ++
> > >  7 files changed, 124 insertions(+), 132 deletions(-)  create mode
> > > 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-builtins.cc
> > > b/gcc/config/aarch64/aarch64-builtins.cc
> > > index 75d21de1401..4afe7c86ae3 100644
> > > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > > @@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum
> aarch64_simd_builtin_data[] = {
> > >VREINTERPRET_BUILTINS \
> > >VREINTERPRETQ_BUILTINS
> > >
> > > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > > +  VGET_LOW_BUILTIN(f16) \
> > > +  VGET_LOW_BUILTIN(f32) \
> > > +  VGET_LOW_BUILTIN(f64) \
> > > +  VGET_LOW_BUILTIN(p8) \
> > > +  VGET_LOW_BUILTIN(p16) \
> > > +  VGET_LOW_BUILTIN(p64) \
> > > +  VGET_LOW_BUILTIN(s8) \
> > > +  VGET_LOW_BUILTIN(s16) \
> > > +  VGET_LOW_BUILTIN(s32) \
> > > +  VGET_LOW_BUILTIN(s64) \
> > > +  VGET_LOW_BUILTIN(u8) \
> > > +  VGET_LOW_BUILTIN(u16) \
> > > +  VGET_LOW_BUILTIN(u32) \
> > > +  VGET_LOW_BUILTIN(u64) \
> > > +  VGET_LOW_BUILTIN(bf16)
> > > +
> > >  typedef struct
> > >  {
> > >const char *name;
> > &

[PATCH] aarch64: Fold vget_low_* intrinsics to BIT_FIELD_REF [PR102171]

2024-05-13 Thread Pengxuan Zheng
This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up more
optimization opportunities for gimple optimizers.

While we are here, we also remove the vget_low_* definitions from arm_neon.h and
use the new intrinsics framework.
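
A minimal sketch of the kind of simplification this enables (my
illustration, assuming little-endian; not from the patch):

#include <arm_neon.h>

int16x4_t
low_of_load (const int16_t *p)
{
  int16x8_t v = vld1q_s16 (p);  /* 128-bit load */
  /* With the fold, this is BIT_FIELD_REF <v, 64, 0> in gimple, so the
     full load plus extract can be shrunk to a single 64-bit load.  */
  return vget_low_s16 (v);
}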

PR target/102171

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
New macro to create definitions for all vget_low intrinsics.
(VGET_LOW_BUILTIN): Likewise.
(enum aarch64_builtins): Add vget_low function codes.
(aarch64_general_fold_builtin): Fold vget_low calls.
* config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
* config/aarch64/aarch64-simd.md (aarch64_get_low): Delete.
(aarch64_vget_lo_halfv8bf): Likewise.
* config/aarch64/arm_neon.h (__attribute__): Delete.
(vget_low_f16): Likewise.
(vget_low_f32): Likewise.
(vget_low_f64): Likewise.
(vget_low_p8): Likewise.
(vget_low_p16): Likewise.
(vget_low_p64): Likewise.
(vget_low_s8): Likewise.
(vget_low_s16): Likewise.
(vget_low_s32): Likewise.
(vget_low_s64): Likewise.
(vget_low_u8): Likewise.
(vget_low_u16): Likewise.
(vget_low_u32): Likewise.
(vget_low_u64): Likewise.
(vget_low_bf16): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
with vget_low_s16.
* gcc.target/aarch64/vget_low_2.c: New test.
* gcc.target/aarch64/vget_low_2_be.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-builtins.cc|  60 ++
 gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
 gcc/config/aarch64/aarch64-simd.md|  23 +---
 gcc/config/aarch64/arm_neon.h | 105 --
 gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
 gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +
 .../gcc.target/aarch64/vget_low_2_be.c|  31 ++
 7 files changed, 124 insertions(+), 132 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 75d21de1401..4afe7c86ae3 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum 
aarch64_simd_builtin_data[] = {
   VREINTERPRET_BUILTINS \
   VREINTERPRETQ_BUILTINS
 
+#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+  VGET_LOW_BUILTIN(f16) \
+  VGET_LOW_BUILTIN(f32) \
+  VGET_LOW_BUILTIN(f64) \
+  VGET_LOW_BUILTIN(p8) \
+  VGET_LOW_BUILTIN(p16) \
+  VGET_LOW_BUILTIN(p64) \
+  VGET_LOW_BUILTIN(s8) \
+  VGET_LOW_BUILTIN(s16) \
+  VGET_LOW_BUILTIN(s32) \
+  VGET_LOW_BUILTIN(s64) \
+  VGET_LOW_BUILTIN(u8) \
+  VGET_LOW_BUILTIN(u16) \
+  VGET_LOW_BUILTIN(u32) \
+  VGET_LOW_BUILTIN(u64) \
+  VGET_LOW_BUILTIN(bf16)
+
 typedef struct
 {
   const char *name;
@@ -697,6 +714,9 @@ typedef struct
 #define VREINTERPRET_BUILTIN(A, B, L) \
   AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
 
+#define VGET_LOW_BUILTIN(A) \
+  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
@@ -732,6 +752,7 @@ enum aarch64_builtins
   AARCH64_CRC32_BUILTIN_MAX,
   /* SIMD intrinsic builtins.  */
   AARCH64_SIMD_VREINTERPRET_BUILTINS
+  AARCH64_SIMD_VGET_LOW_BUILTINS
   /* ARMv8.3-A Pointer Authentication Builtins.  */
   AARCH64_PAUTH_BUILTIN_AUTIA1716,
   AARCH64_PAUTH_BUILTIN_PACIA1716,
@@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum 
aarch64_fcmla_lane_builtin_data[] = {
  && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
   },
 
+#undef VGET_LOW_BUILTIN
+#define VGET_LOW_BUILTIN(A) \
+  {"vget_low_" #A, \
+   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
+   2, \
+   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
+   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
+   FLAG_AUTO_FP, \
+   false \
+  },
+
+#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+  VGET_LOW_BUILTIN(f16) \
+  VGET_LOW_BUILTIN(f32) \
+  VGET_LOW_BUILTIN(f64) \
+  VGET_LOW_BUILTIN(p8) \
+  VGET_LOW_BUILTIN(p16) \
+  VGET_LOW_BUILTIN(p64) \
+  VGET_LOW_BUILTIN(s8) \
+  VGET_LOW_BUILTIN(s16) \
+  VGET_LOW_BUILTIN(s32) \
+  VGET_LOW_BUILTIN(s64) \
+  VGET_LOW_BUILTIN(u8) \
+  VGET_LOW_BUILTIN(u16) \
+  VGET_LOW_BUILTIN(u32) \
+  VGET_LOW_BUILTIN(u64) \
+  VGET_LOW_BUILTIN(bf16)
+
 static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
   AARCH64_SIMD_VREINTERPRET_BUILTINS
+  AARCH64_SIMD_VGET_LOW_BUILTINS
 };
 
 
@@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, 
tree arg2)
 #define VREINTERPRET_BUILTIN(A, B, L) \
   case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
 
+#undef VGET_LOW_BUILTIN
+#define VGET_LOW_BU

[PATCH] aarch64: Add vector popcount besides QImode [PR113859]

2024-04-30 Thread Pengxuan Zheng
This patch improves GCC’s vectorization of __builtin_popcount for the aarch64 target
by adding popcount patterns for vector modes besides QImode, i.e., HImode,
SImode and DImode.

With this patch, we now generate the following for HImode:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b

For SImode, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h

For V2DI, we generate:
  cnt v1.16b, v.16b
  uaddlp  v2.8h, v1.16b
  uaddlp  v3.4s, v2.8h
  uaddlp  v4.2d, v3.4s
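
For reference, a scalar C model (my sketch, not part of the patch) of how the
cnt + uaddlp chain computes a DImode popcount, pairwise-summing per-byte
counts into progressively wider lanes:

unsigned long long
popcount_u64_model (unsigned long long x)
{
  unsigned char b[8];
  unsigned short h[4];
  unsigned int s[2];
  __builtin_memcpy (b, &x, 8);
  for (int i = 0; i < 8; i++)   /* cnt    v1.16b, v.16b  */
    b[i] = __builtin_popcount (b[i]);
  for (int i = 0; i < 4; i++)   /* uaddlp v2.8h,  v1.16b */
    h[i] = b[2 * i] + b[2 * i + 1];
  for (int i = 0; i < 2; i++)   /* uaddlp v3.4s,  v2.8h  */
    s[i] = h[2 * i] + h[2 * i + 1];
  return s[0] + (unsigned long long) s[1];   /* uaddlp v4.2d, v3.4s */
}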

gcc/ChangeLog:

PR target/113859
* config/aarch64/aarch64-simd.md (popcount2): New define_expand.

gcc/testsuite/ChangeLog:

PR target/113859
* gcc.target/aarch64/popcnt-vec.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md| 40 
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 48 +++
 2 files changed, 88 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index f8bb973a278..093c32ee8ff 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3540,6 +3540,46 @@ (define_insn "popcount2"
   [(set_attr "type" "neon_cnt")]
 )
 
+(define_expand "popcount2"
+  [(set (match_operand:VQN 0 "register_operand" "=w")
+(popcount:VQN (match_operand:VQN 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  {
+rtx v = gen_reg_rtx (V16QImode);
+rtx v1 = gen_reg_rtx (V16QImode);
+emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
+emit_insn (gen_popcountv16qi2 (v1, v));
+if (mode == V8HImode)
+  {
+/* For V8HI, we generate:
+cnt v1.16b, v.16b
+uaddlp  v2.8h, v1.16b */
+emit_insn (gen_aarch64_uaddlpv16qi (operands[0], v1));
+DONE;
+  }
+rtx v2 = gen_reg_rtx (V8HImode);
+emit_insn (gen_aarch64_uaddlpv16qi (v2, v1));
+if (mode == V4SImode)
+  {
+/* For V4SI, we generate:
+cnt v1.16b, v.16b
+uaddlp  v2.8h, v1.16b
+uaddlp  v3.4s, v2.8h */
+emit_insn (gen_aarch64_uaddlpv8hi (operands[0], v2));
+DONE;
+  }
+/* For V2DI, we generate:
+cnt v1.16b, v.16b
+uaddlp  v2.8h, v1.16b
+uaddlp  v3.4s, v2.8h
+uaddlp  v4.2d, v3.4s */
+rtx v3 = gen_reg_rtx (V4SImode);
+emit_insn (gen_aarch64_uaddlpv8hi (v3, v2));
+emit_insn (gen_aarch64_uaddlpv4si (operands[0], v3));
+DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
new file mode 100644
index 000..4c9a1b95990
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-vec.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* This function should produce cnt v.16b. */
+void
+bar (unsigned char *__restrict b, unsigned char *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and uaddlp (Add Long Pairwise). */
+void
+bar1 (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and 2 uaddlp (Add Long Pairwise). */
+void
+bar2 (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcount (b[i]);
+}
+
+/* This function should produce cnt v.16b and 3 uaddlp (Add Long Pairwise). */
+void
+bar3 (unsigned long long *__restrict b, unsigned long long *__restrict d)
+{
+  for (int i = 0; i < 1024; i++)
+d[i] = __builtin_popcountll (b[i]);
+}
+
+/* SLP
+ This function should produce cnt v.16b and 3 uaddlp (Add Long Pairwise). */
+void
+bar4 (unsigned long long *__restrict b, unsigned long long *__restrict d)
+{
+  d[0] = __builtin_popcountll (b[0]);
+  d[1] = __builtin_popcountll (b[1]);
+}
+
+/* { dg-final { scan-assembler-not {\tbl\tpopcount} } } */
+/* { dg-final { scan-assembler-times {cnt\t} 5 } } */
+/* { dg-final { scan-assembler-times {uaddlp\t} 9 } } */
+/* { dg-final { scan-assembler-times {ldr\tq} 5 } } */
-- 
2.17.1