Re: [PATCH v4 4/5] Add tests for C/C++ musttail attributes

2024-02-02 Thread Prathamesh Kulkarni
On Fri, 2 Feb 2024 at 14:44, Andi Kleen  wrote:
>
> Mostly adopted from the existing C musttail plugin tests.
>
> gcc/testsuite/ChangeLog:
>
> * c-c++-common/musttail1.c: New test.
> * c-c++-common/musttail2.c: New test.
> * c-c++-common/musttail3.c: New test.
> * c-c++-common/musttail4.c: New test.
> * c-c++-common/musttail5.c: New test.
> ---
>  gcc/testsuite/c-c++-common/musttail1.c | 15 
>  gcc/testsuite/c-c++-common/musttail2.c | 34 ++
>  gcc/testsuite/c-c++-common/musttail3.c | 29 ++
>  gcc/testsuite/c-c++-common/musttail4.c | 17 +
>  gcc/testsuite/c-c++-common/musttail5.c | 25 +++
>  5 files changed, 120 insertions(+)
>  create mode 100644 gcc/testsuite/c-c++-common/musttail1.c
>  create mode 100644 gcc/testsuite/c-c++-common/musttail2.c
>  create mode 100644 gcc/testsuite/c-c++-common/musttail3.c
>  create mode 100644 gcc/testsuite/c-c++-common/musttail4.c
>  create mode 100644 gcc/testsuite/c-c++-common/musttail5.c
>
> diff --git a/gcc/testsuite/c-c++-common/musttail1.c 
> b/gcc/testsuite/c-c++-common/musttail1.c
> new file mode 100644
> index ..ac92f9f74616
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail1.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile { target { tail_call && { c || c++11 } } } } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-fdelayed-branch" { target sparc*-*-* } } */
> +
> +int __attribute__((noinline,noclone,noipa))
> +callee (int i)
Hi Andi,
Sorry, I wasn't clear about this in previous patch -- noipa will
subsume other ipa attributes,
so there's no need to have noinline, noclone along with noipa.
int __attribute__((noipa)) callee(int i) should be sufficient for
disabling IPA optimizations involving callee.

Thanks,
Prathamesh

> +{
> +  return i * i;
> +}
> +
> +int __attribute__((noinline,noclone,noipa))
> +caller (int i)
> +{
> +  [[gnu::musttail]] return callee (i + 1);
> +}
> diff --git a/gcc/testsuite/c-c++-common/musttail2.c 
> b/gcc/testsuite/c-c++-common/musttail2.c
> new file mode 100644
> index ..058329b69cc2
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail2.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile { target { tail_call && { c || c++11 } } } } */
> +
> +struct box { char field[256]; int i; };
> +
> +int __attribute__((noinline,noclone,noipa))
> +test_2_callee (int i, struct box b)
> +{
> +  if (b.field[0])
> +return 5;
> +  return i * i;
> +}
> +
> +int __attribute__((noinline,noclone,noipa))
> +test_2_caller (int i)
> +{
> +  struct box b;
> +  [[gnu::musttail]] return test_2_callee (i + 1, b); /* { dg-error "cannot 
> tail-call: " } */
> +}
> +
> +extern void setjmp (void);
> +void
> +test_3 (void)
> +{
> +  [[gnu::musttail]] return setjmp (); /* { dg-error "cannot tail-call: " } */
> +}
> +
> +typedef void (fn_ptr_t) (void);
> +volatile fn_ptr_t fn_ptr;
> +
> +void
> +test_5 (void)
> +{
> +  [[gnu::musttail]] return fn_ptr (); /* { dg-error "cannot tail-call: " } */
> +}
> diff --git a/gcc/testsuite/c-c++-common/musttail3.c 
> b/gcc/testsuite/c-c++-common/musttail3.c
> new file mode 100644
> index ..ea9589c59ef2
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail3.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile { target { tail_call && { c || c++11 } } } } */
> +
> +extern int foo2 (int x, ...);
> +
> +struct str
> +{
> +  int a, b;
> +};
> +
> +struct str
> +cstruct (int x)
> +{
> +  if (x < 10)
> +[[clang::musttail]] return cstruct (x + 1);
> +  return ((struct str){ x, 0 });
> +}
> +
> +int
> +foo (int x)
> +{
> +  if (x < 10)
> +[[clang::musttail]] return foo2 (x, 29);
> +  if (x < 100)
> +{
> +  int k = foo (x + 1);
> +  [[clang::musttail]] return k;/* { dg-error "cannot tail-call: " } 
> */
> +}
> +  return x;
> +}
> diff --git a/gcc/testsuite/c-c++-common/musttail4.c 
> b/gcc/testsuite/c-c++-common/musttail4.c
> new file mode 100644
> index ..23f4b5e1cd68
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail4.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { tail_call && { c || c++11 } } } } */
> +
> +struct box { char field[64]; int i; };
> +
> +struct box __attribute__((noinline,noclone,noipa))
> +returns_struct (int i)
> +{
> +  struct box b;
> +  b.i = i * i;
> +  return b;
> +}
> +
> +int __attribute__((noinline,noclone))
> +test_1 (int i)
> +{
> +  [[gnu::musttail]] return returns_struct (i * 5).i; /* { dg-error "cannot 
> tail-call: " } */
> +}
> diff --git a/gcc/testsuite/c-c++-common/musttail5.c 
> b/gcc/testsuite/c-c++-common/musttail5.c
> new file mode 100644
> index ..71f4de40fc6d
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail5.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-std=c23" { target c } } */
> +/* { dg-options "-std=gnu++11" { target c++ } } */
> +
> +[[musttail]] int j; /* { dg-warning "attribute" } */
> +__attribute__((musttail)) int k; 

Re: [PATCH v3 4/5] Add tests for C/C++ musttail attributes

2024-01-31 Thread Prathamesh Kulkarni
On Wed, 31 Jan 2024 at 07:49, Andi Kleen  wrote:
>
> Mostly adopted from the existing C musttail plugin tests.
> ---
>  gcc/testsuite/c-c++-common/musttail1.c  | 17 
>  gcc/testsuite/c-c++-common/musttail2.c  | 36 +
>  gcc/testsuite/c-c++-common/musttail3.c  | 31 +
>  gcc/testsuite/c-c++-common/musttail4.c  | 19 +
>  gcc/testsuite/gcc.dg/musttail-invalid.c | 17 
>  5 files changed, 120 insertions(+)
>  create mode 100644 gcc/testsuite/c-c++-common/musttail1.c
>  create mode 100644 gcc/testsuite/c-c++-common/musttail2.c
>  create mode 100644 gcc/testsuite/c-c++-common/musttail3.c
>  create mode 100644 gcc/testsuite/c-c++-common/musttail4.c
>  create mode 100644 gcc/testsuite/gcc.dg/musttail-invalid.c
>
> diff --git a/gcc/testsuite/c-c++-common/musttail1.c 
> b/gcc/testsuite/c-c++-common/musttail1.c
> new file mode 100644
> index ..476185e3ed4b
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail1.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target tail_call } } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-std=c++11" { target c++ } } */
> +/* { dg-additional-options "-std=c23" { target c } } */
> +/* { dg-additional-options "-fdelayed-branch" { target sparc*-*-* } } */
> +
> +int __attribute__((noinline,noclone))
Hi,
Sorry to nitpick -- Just wondering if it'd be slightly better to use
noipa attribute instead, assuming the intent is to disable IPA opts ?

Thanks,
Prathamesh


> +callee (int i)
> +{
> +  return i * i;
> +}
> +
> +int __attribute__((noinline,noclone))
> +caller (int i)
> +{
> +  [[gnu::musttail]] return callee (i + 1);
> +}
> diff --git a/gcc/testsuite/c-c++-common/musttail2.c 
> b/gcc/testsuite/c-c++-common/musttail2.c
> new file mode 100644
> index ..28f2f68ef13d
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail2.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile { target tail_call } } */
> +/* { dg-additional-options "-std=c++11" { target c++ } } */
> +/* { dg-additional-options "-std=c23" { target c } } */
> +
> +struct box { char field[256]; int i; };
> +
> +int __attribute__((noinline,noclone))
> +test_2_callee (int i, struct box b)
> +{
> +  if (b.field[0])
> +return 5;
> +  return i * i;
> +}
> +
> +int __attribute__((noinline,noclone))
> +test_2_caller (int i)
> +{
> +  struct box b;
> +  [[gnu::musttail]] return test_2_callee (i + 1, b); /* { dg-error "cannot 
> tail-call: " } */
> +}
> +
> +extern void setjmp (void);
> +void
> +test_3 (void)
> +{
> +  [[gnu::musttail]] return setjmp (); /* { dg-error "cannot tail-call: " } */
> +}
> +
> +typedef void (fn_ptr_t) (void);
> +volatile fn_ptr_t fn_ptr;
> +
> +void
> +test_5 (void)
> +{
> +  [[gnu::musttail]] return fn_ptr (); /* { dg-error "cannot tail-call: " } */
> +}
> diff --git a/gcc/testsuite/c-c++-common/musttail3.c 
> b/gcc/testsuite/c-c++-common/musttail3.c
> new file mode 100644
> index ..fdbb292944ad
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail3.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile { target tail_call } } */
> +/* { dg-additional-options "-std=c++11" { target c++ } } */
> +/* { dg-additional-options "-std=c23" { target c } } */
> +
> +extern int foo2 (int x, ...);
> +
> +struct str
> +{
> +  int a, b;
> +};
> +
> +struct str
> +cstruct (int x)
> +{
> +  if (x < 10)
> +[[clang::musttail]] return cstruct (x + 1);
> +  return ((struct str){ x, 0 });
> +}
> +
> +int
> +foo (int x)
> +{
> +  if (x < 10)
> +[[clang::musttail]] return foo2 (x, 29);
> +  if (x < 100)
> +{
> +  int k = foo (x + 1);
> +  [[clang::musttail]] return k;/* { dg-error "cannot tail-call: " } 
> */
> +}
> +  return x;
> +}
> diff --git a/gcc/testsuite/c-c++-common/musttail4.c 
> b/gcc/testsuite/c-c++-common/musttail4.c
> new file mode 100644
> index ..7bf44816f14a
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/musttail4.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile { target tail_call } } */
> +/* { dg-additional-options "-std=c++11" { target c++ } } */
> +/* { dg-additional-options "-std=c23" { target c } } */
> +
> +struct box { char field[64]; int i; };
> +
> +struct box __attribute__((noinline,noclone))
> +returns_struct (int i)
> +{
> +  struct box b;
> +  b.i = i * i;
> +  return b;
> +}
> +
> +int __attribute__((noinline,noclone))
> +test_1 (int i)
> +{
> +  [[gnu::musttail]] return returns_struct (i * 5).i; /* { dg-error "cannot 
> tail-call: " } */
> +}
> diff --git a/gcc/testsuite/gcc.dg/musttail-invalid.c 
> b/gcc/testsuite/gcc.dg/musttail-invalid.c
> new file mode 100644
> index ..c4725b4b8226
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/musttail-invalid.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-std=c23" } */
> +
> +[[musttail]] int j; /* { dg-warning "attribute ignored" } */
> +__attribute__((musttail)) int k; /* { dg-warning "attribute directive 
> ignored" } */
> +
> +void foo(void)
> +{
> +   

Re: [PATCH] aarch64: Fix ICE in poly-int.h due to SLP.

2024-01-30 Thread Prathamesh Kulkarni
On Tue, 30 Jan 2024 at 20:13, Richard Ball  wrote:
>
> Adds a check to ensure that the input vector arguments
> to a function are not variable length. Previously, only the
> output vector of a function was checked.
Hi,
Quoting from patch:
@@ -8989,6 +8989,14 @@ vectorizable_slp_permutation_1 (vec_info
*vinfo, gimple_stmt_iterator *gsi,
   instead of relying on the pattern described above.  */
   if (!nunits.is_constant ())
  return -1;
+  FOR_EACH_VEC_ELT (children, i, child)
+ if (SLP_TREE_VECTYPE (child))
+   {
+ tree child_vectype = SLP_TREE_VECTYPE (child);
+ poly_uint64 child_nunits = TYPE_VECTOR_SUBPARTS (child_vectype);
+ if (!child_nunits.is_constant ())
+   return -1;
+   }

Just wondering if that'd be equivalent to checking:
if (!TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
  return -1;
Instead of (again) iterating over children since we bail out in the
function above,
if SLP_TREE_VECTYPE (child) and op_vectype are not compatible types ?

Also, could you please include the offending test-case in the patch ?

Thanks,
Prathamesh

>
> gcc/ChangeLog:
>
> * tree-vect-slp.cc (vectorizable_slp_permutation_1):
> Add variable-length check for vector input arguments
> to a function.


Re: [aarch64] PR112950: gcc.target/aarch64/sve/acle/general/dupq_5.c fails on aarch64_be-linux-gnu

2024-01-29 Thread Prathamesh Kulkarni
On Sat, 27 Jan 2024 at 21:19, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi,
> > The test passes -mlittle-endian option but doesn't have target check
> > for aarch64_little_endian and thus fails to compile on
> > aarch64_be-linux-gnu. The patch adds the missing aarch64_little_endian
> > target check, which makes it unsupported on the target.
> > OK to commit ?
> >
> > Thanks,
> > Prathamesh
> >
> > PR112950: Add aarch64_little_endian target check for dupq_5.c
> >
> > gcc/testsuite/ChangeLog:
> >   PR target/112950
> >   * gcc.target/aarch64/sve/acle/general/dupq_5.c: Add
> >   aarch64_little_endian target check.
>
> If we add this requirement, then there's no need to pass -mlittle-endian
> in the dg-options.
>
> But dupq_6.c (the corresponding big-endian test) has:
>
>   /* To avoid needing big-endian header files.  */
>   #pragma GCC aarch64 "arm_sve.h"
>
> instead of:
>
> >   #include <arm_sve.h>
>
> Could you do the same thing here?
That worked, thanks! And it also makes dupq_5.c pass on aarch64_be-linux-gnu.

Thanks,
Prathamesh

>
> Thanks,
> Richard
>
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c 
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
> > index 6ae8d4c60b2..1990412d0e5 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
> > @@ -1,5 +1,6 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-O2 -mlittle-endian" } */
> > +/* { dg-require-effective-target aarch64_little_endian } */
> >
> >  #include <arm_sve.h>
> >
PR112950: Use #pragma GCC for including arm_sve.h. 

gcc/testsuite/ChangeLog:
PR target/112950
* gcc.target/aarch64/sve/acle/general/dupq_5.c: Remove include directive
and instead use #pragma GCC for including arm_sve.h.

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
index 6ae8d4c60b2..e88477b6379 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mlittle-endian" } */
 
> -#include <arm_sve.h>
+#pragma GCC aarch64 "arm_sve.h"
 
 svint32_t
 dupq (int x1, int x2, int x3, int x4)


[aarch64] PR112950: gcc.target/aarch64/sve/acle/general/dupq_5.c fails on aarch64_be-linux-gnu

2024-01-27 Thread Prathamesh Kulkarni
Hi,
The test passes -mlittle-endian option but doesn't have target check
for aarch64_little_endian and thus fails to compile on
aarch64_be-linux-gnu. The patch adds the missing aarch64_little_endian
target check, which makes it unsupported on the target.
OK to commit ?

Thanks,
Prathamesh
PR112950: Add aarch64_little_endian target check for dupq_5.c

gcc/testsuite/ChangeLog:
PR target/112950
* gcc.target/aarch64/sve/acle/general/dupq_5.c: Add
aarch64_little_endian target check.

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
index 6ae8d4c60b2..1990412d0e5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-require-effective-target aarch64_little_endian } */
 
 #include <arm_sve.h>
 


Re: [PATCH] cse: Fix handling of fake vec_select sets [PR111702]

2023-12-26 Thread Prathamesh Kulkarni
On Thu, 21 Dec 2023 at 00:00, Richard Sandiford
 wrote:
>
> If cse sees:
>
>   (set (reg R) (const_vector [A B ...]))
>
> it creates fake sets of the form:
>
>   (set R[0] A)
>   (set R[1] B)
>   ...
>
> (with R[n] replaced by appropriate rtl) and then adds them to the tables
> in the same way as for normal sets.  This allows a sequence like:
>
>   (set (reg R2) A)
>   ...(reg R2)...
>
> to try to use R[0] instead of (reg R2).
>
> But the pass was taking the analogy too far, and was trying to simplify
> these fake sets based on costs.  That is, if there was an earlier:
>
>   (set (reg T) A)
>
> the pass would go to considerable effort trying to work out whether:
>
>   (set R[0] A)
>
> or:
>
>   (set R[0] (reg T))
>
> was more profitable.  This included running validate*_change on the sets,
> which has no meaning given that the sets are not part of the insn.
>
> In this example, the equivalence A == T is already known, and the
> purpose of the fake sets is to add A == T == R[0].  We can do that
> just as easily (or, as the PR shows, more easily) if we keep the
> original form of the fake set, with A instead of T.
>
> The problem in the PR occurred if we had:
>
> (1) something that establishes an equivalence between a vector V1 of
> M-bit scalar integers and a hard register H
>
> (2) something that establishes an equivalence between a vector V2 of
> N-bit scalar integers, where N < M, and N instances of V1[0]
>
> (1) established an equivalence between V1[0] and H in M bits.
> (2) then triggered a search for an equivalence of V1[0] in N bits.
> This included:
>
>   /* See if we have a CONST_INT that is already in a register in a
>  wider mode.  */
>
> which (correctly) found that the low N bits of H contain the right value.
> But because it came from a wider mode, this equivalence between N-bit H
> and N-bit V1[0] was not yet in the hash table.  It therefore survived
> the purge in:
>
>   /* At this point, ELT, if nonzero, points to a class of expressions
>  equivalent to the source of this SET and SRC, SRC_EQV, SRC_FOLDED,
>  and SRC_RELATED, if nonzero, each contain additional equivalent
>  expressions.  Prune these latter expressions by deleting expressions
>  already in the equivalence class.
>
> And since more than 1 set found the same N-bit equivalence between
> H and V1[0], the pass tried to add it more than once.
>
> Things were already wrong at this stage, but an ICE was only triggered
> later when trying to merge this N-bit equivalence with another one.
>
> We could avoid the double registration by adding:
>
>   for (elt = classp; elt; elt = elt->next_same_value)
> if (rtx_equal_p (elt->exp, x))
>   return elt;
>
> to insert_with_costs, or by making cse_insn check whether previous
> sets have recorded the same equivalence.  The latter seems more
> appealing from a compile-time perspective.  But in this case,
> doing that would be adding yet more spurious work to the handling
> of fake sets.
>
> The handling of fake sets therefore seems like the more fundamental bug.
>
> While there, the patch also makes sure that we don't apply REG_EQUAL
> notes to these fake sets.  They only describe the "real" (first) set.
Hi Richard,
Thanks for the detailed explanation and fix!

Thanks,
Prathamesh
>
> gcc/
> PR rtl-optimization/111702
> * cse.cc (set::mode): Move earlier.
> (set::src_in_memory, set::src_volatile): Convert to bitfields.
> (set::is_fake_set): New member variable.
> (add_to_set): Add an is_fake_set parameter.
> (find_sets_in_insn): Update calls accordingly.
> (cse_insn): Do not apply REG_EQUAL notes to fake sets.  Do not
> try to optimize them either, or validate changes to them.
>
> gcc/
> PR rtl-optimization/111702
> * gcc.dg/rtl/aarch64/pr111702.c: New test.
> ---
>  gcc/cse.cc  | 38 +++---
>  gcc/testsuite/gcc.dg/rtl/aarch64/pr111702.c | 43 +
>  2 files changed, 67 insertions(+), 14 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/pr111702.c
>
> diff --git a/gcc/cse.cc b/gcc/cse.cc
> index f9603fdfd43..9fd51ca2832 100644
> --- a/gcc/cse.cc
> +++ b/gcc/cse.cc
> @@ -4128,13 +4128,17 @@ struct set
>unsigned dest_hash;
>/* The SET_DEST, with SUBREG, etc., stripped.  */
>rtx inner_dest;
> +  /* Original machine mode, in case it becomes a CONST_INT.  */
> +  ENUM_BITFIELD(machine_mode) mode : MACHINE_MODE_BITSIZE;
>/* Nonzero if the SET_SRC is in memory.  */
> -  char src_in_memory;
> +  unsigned int src_in_memory : 1;
>/* Nonzero if the SET_SRC contains something
>   whose value cannot be predicted and understood.  */
> -  char src_volatile;
> -  /* Original machine mode, in case it becomes a CONST_INT.  */
> -  ENUM_BITFIELD(machine_mode) mode : MACHINE_MODE_BITSIZE;
> +  unsigned int src_volatile : 1;
> +  /* Nonzero if RTL is an artificial set that has been 

Re: [aarch64] PR111702 - ICE in insert_regs after interleave+zip1 vector initialization patch

2023-12-19 Thread Prathamesh Kulkarni
On Mon, 4 Dec 2023 at 14:44, Prathamesh Kulkarni
 wrote:
>
> On Thu, 23 Nov 2023 at 17:06, Prathamesh Kulkarni
>  wrote:
> >
> > Hi Richard,
> > For the test-case mentioned in PR111702, compiling with -O2
> > -frounding-math -fstack-protector-all results in following ICE during
> > cse2 pass:
> >
> > test.c: In function 'foo':
> > test.c:119:1: internal compiler error: in insert_regs, at cse.cc:1120
> >   119 | }
> >   | ^
> > 0xb7ebb0 insert_regs
> > ../../gcc/gcc/cse.cc:1120
> > 0x1f95134 merge_equiv_classes
> > ../../gcc/gcc/cse.cc:1764
> > 0x1f9b9ab cse_insn
> > ../../gcc/gcc/cse.cc:4793
> > 0x1f9fe30 cse_extended_basic_block
> > ../../gcc/gcc/cse.cc:6577
> > 0x1f9fe30 cse_main
> > ../../gcc/gcc/cse.cc:6722
> > 0x1fa0984 rest_of_handle_cse2
> > ../../gcc/gcc/cse.cc:7620
> > 0x1fa0984 execute
> > ../../gcc/gcc/cse.cc:7675
> >
> > This happens only with interleave+zip1 vector initialization with
> > -frounding-math -fstack-protector-all, while it compiles OK without
> > -fstack-protector-all. Also, it compiles OK with fallback sequence
> > code-gen (with or without -fstack-protector-all). Unfortunately, I
> > haven't been able to reduce the test-case further :/
> >
> > From the test-case, it seems only the vector initializer for type J
> > uses interleave+zip1 approach, while rest of the vector initializers
> > use fallback sequence.
> >
> > J is defined as:
> > typedef _Float16 __attribute__((__vector_size__ (16))) J;
> >
> > and the initializer is:
> > (J) { 11654, 4801, 5535, 9743, 61680}
> >
> > interleave+zip1 sequence for above initializer J:
> > mode = V8HF
> >
> > vals: (parallel:V8HF [
> > (reg:HF 642)
> > (reg:HF 645)
> > (reg:HF 648)
> > (reg:HF 651)
> > (reg:HF 654)
> > (const_double:HF 0.0 [0x0.0p+0]) repeated x3
> > ])
> >
> > target: (reg:V8HF 641)
> > seq:
> > (insn 1058 0 1059 (set (reg:V4HF 657)
> > (const_vector:V4HF [
> > (const_double:HF 0.0 [0x0.0p+0]) repeated x4
> > ])) "test.c":81:8 -1
> >  (nil))
> > (insn 1059 1058 1060 (set (reg:V4HF 657)
> > (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 642))
> > (reg:V4HF 657)
> > (const_int 1 [0x1]))) "test.c":81:8 -1
> >  (nil))
> > (insn 1060 1059 1061 (set (reg:V4HF 657)
> > (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 648))
> > (reg:V4HF 657)
> > (const_int 2 [0x2]))) "test.c":81:8 -1
> >  (nil))
> > (insn 1061 1060 1062 (set (reg:V4HF 657)
> > (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 654))
> > (reg:V4HF 657)
> > (const_int 4 [0x4]))) "test.c":81:8 -1
> >  (nil))
> > (insn 1062 1061 1063 (set (reg:V4HF 658)
> > (const_vector:V4HF [
> > (const_double:HF 0.0 [0x0.0p+0]) repeated x4
> > ])) "test.c":81:8 -1
> >  (nil))
> > (insn 1063 1062 1064 (set (reg:V4HF 658)
> > (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 645))
> > (reg:V4HF 658)
> > (const_int 1 [0x1]))) "test.c":81:8 -1
> >  (nil))
> > (insn 1064 1063 1065 (set (reg:V4HF 658)
> > (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 651))
> > (reg:V4HF 658)
> > (const_int 2 [0x2]))) "test.c":81:8 -1
> >  (nil))
> > (insn 1065 1064 0 (set (reg:V8HF 641)
> > (unspec:V8HF [
> > (subreg:V8HF (reg:V4HF 657) 0)
> > (subreg:V8HF (reg:V4HF 658) 0)
> > ] UNSPEC_ZIP1)) "test.c":81:8 -1
> >  (nil))
> >
> > It seems to me that the above sequence correctly initializes the
> > vector into r641 ?
> > insns 1058-1061 construct r657 = { r642, r648, r654, 0 }
> > insns 1062-1064 construct r658 = { r645, r651, 0, 0 }
> > and zip1 will create r641 = { r642, r645, r648, r651, r654, 0, 0, 0 }
> >
> > For the above test, it seems that with interleave+zip1 approach and
> > -fstack-protector-all,
> > in cse pass, there are two separate equivalence classes created for
> > (const_int 1), that need
> > to be merged in cse_insn:
> >
> >if (elt->first_same_value != src_eqv_elt->first_same_value)
> > {
> >   /

Re: [PATCH 4/5] aarch64: rcpc3: add Neon ACLE wrapper functions to `arm_neon.h'

2023-12-07 Thread Prathamesh Kulkarni
On Thu, 9 Nov 2023 at 19:44, Victor Do Nascimento
 wrote:
>
> Create the necessary mappings from the ACLE-defined Neon intrinsics
> names[1] to the internal builtin function names.
>
> [1] https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
Hi Victor,
It seems this patch broke kernel build after the recent patch to
upgrade -Wincompatible-pointer-types to an error:

00:00:56 
/home/tcwg-buildslave/workspace/tcwg_kernel_1/abe/builds/destdir/x86_64-pc-linux-gnu/lib/gcc/aarch64-linux-gnu/14.0.0/include/arm_neon.h:
In function ‘vldap1_lane_s64’:
00:00:56 
/home/tcwg-buildslave/workspace/tcwg_kernel_1/abe/builds/destdir/x86_64-pc-linux-gnu/lib/gcc/aarch64-linux-gnu/14.0.0/include/arm_neon.h:13474:48:
error: passing argument 1 of ‘__builtin_aarch64_vec_ldap1_lanev1di’
from incompatible pointer type [-Wincompatible-pointer-types]
00:00:56 13474 |   return __builtin_aarch64_vec_ldap1_lanev1di (__src,
__vec, __lane);
00:00:56   |^
00:00:56   ||
00:00:56   |const
int64_t * {aka const long long int *}
00:00:56 
/home/tcwg-buildslave/workspace/tcwg_kernel_1/abe/builds/destdir/x86_64-pc-linux-gnu/lib/gcc/aarch64-linux-gnu/14.0.0/include/arm_neon.h:13474:48:
note: expected ‘const long int *’ but argument is of type ‘const
int64_t *’ {aka ‘const long long int *’}

Looking cursorily at the code, should __src be casted to
(__builtin_aarch64_simd_di *) before passing it to
__builtin_aarch64_vec_ldap1_lanev1di ?
For more details, please see:
https://ci.linaro.org/job/tcwg_kernel--gnu-master-aarch64-next-defconfig-build/91/artifact/artifacts/notify/mail-body.txt/*view*/

Thanks,
Prathamesh


>
> gcc/ChangeLog:
>
> * gcc/config/aarch64/arm_neon.h (vldap1_lane_u64): New.
> (vldap1q_lane_u64): Likewise.
> (vldap1_lane_s64): Likewise.
> (vldap1q_lane_s64): Likewise.
> (vldap1_lane_f64): Likewise.
> (vldap1q_lane_f64): Likewise.
> (vldap1_lane_p64): Likewise.
> (vldap1q_lane_p64): Likewise.
> (vstl1_lane_u64): Likewise.
> (vstl1q_lane_u64): Likewise.
> (vstl1_lane_s64): Likewise.
> (vstl1q_lane_s64): Likewise.
> (vstl1_lane_f64): Likewise.
> (vstl1q_lane_f64): Likewise.
> (vstl1_lane_p64): Likewise.
> (vstl1q_lane_p64): Likewise.
> ---
>  gcc/config/aarch64/arm_neon.h | 129 ++
>  1 file changed, 129 insertions(+)
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 349f3167699..ef0d75e07ce 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -13446,6 +13446,135 @@ vld1q_lane_u64 (const uint64_t *__src, uint64x2_t 
> __vec, const int __lane)
>return __aarch64_vset_lane_any (*__src, __vec, __lane);
>  }
>
> +#pragma GCC push_options
> +#pragma GCC target ("+nothing+rcpc3+simd")
> +
> +/* vldap1_lane.  */
> +
> +__extension__ extern __inline uint64x1_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vldap1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
> +{
> +  return __builtin_aarch64_vec_ldap1_lanev1di_usus (
> + (__builtin_aarch64_simd_di *) __src, __vec, __lane);
> +}
> +
> +__extension__ extern __inline uint64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vldap1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
> +{
> +  return __builtin_aarch64_vec_ldap1_lanev2di_usus (
> + (__builtin_aarch64_simd_di *) __src, __vec, __lane);
> +}
> +
> +__extension__ extern __inline int64x1_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vldap1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
> +{
> +  return __builtin_aarch64_vec_ldap1_lanev1di (__src, __vec, __lane);
> +}
> +
> +__extension__ extern __inline int64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vldap1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
> +{
> +  return __builtin_aarch64_vec_ldap1_lanev2di (__src, __vec, __lane);
> +}
> +
> +__extension__ extern __inline float64x1_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vldap1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
> +{
> +  return __builtin_aarch64_vec_ldap1_lanev1df (__src, __vec, __lane);
> +}
> +
> +__extension__ extern __inline float64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vldap1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int 
> __lane)
> +{
> +  return __builtin_aarch64_vec_ldap1_lanev2df (__src, __vec, __lane);
> +}
> +
> +__extension__ extern __inline poly64x1_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vldap1_lane_p64 (const poly64_t *__src, poly64x1_t __vec, 

Re: [PATCH v3 10/11] c: Turn -Wincompatible-pointer-types into a permerror

2023-12-06 Thread Prathamesh Kulkarni
On Mon, 20 Nov 2023 at 15:28, Florian Weimer  wrote:
>
> The change to build_conditional_expr drops the downgrade
> from a pedwarn to warning for builtins for C99 and later
> language dialects.  It remains a warning in C89 mode (not
> a permerror), as the -std=gnu89 -fno-permissive test shows.
Hi Florian,
It seems this patch caused a fallout for
gcc.dg/fixed-point/composite-type.c on arm, where the tests for
warnings fail.
For instance:
FAIL: gcc.dg/fixed-point/composite-type.c  (test for warnings, line 71)
Excess errors:
/home/tcwg-buildslave/workspace/tcwg_gnu_1/abe/snapshots/gcc.git~master/gcc/testsuite/gcc.dg/fixed-point/composite-type.c:71:3:
error: passing argument 1 of 'f2_sf' from incompatible pointer type
[-Wincompatible-pointer-types]
/home/tcwg-buildslave/workspace/tcwg_gnu_1/abe/snapshots/gcc.git~master/gcc/testsuite/gcc.dg/fixed-point/composite-type.c:71:3:
error: passing argument 1 of 'f2_sf' from incompatible pointer type
[-Wincompatible-pointer-types]
(snipped rest)

Should these warnings be now upgraded to dg-error ?

Thanks,
Prathamesh
>
> gcc/
>
> * doc/invoke.texi (Warning Options): Document changes.
>
> gcc/c/
>
> PR c/96284
> * c-typeck.cc (build_conditional_expr): Upgrade most pointer
> type mismatches to a permerror.
> (convert_for_assignment): Use permerror_opt and
> permerror_init for OPT_Wincompatible_pointer_types warnings.
>
> gcc/testsuite/
>
> * gcc.dg/permerror-default.c (incompatible_pointer_types):
> Expect new permerror.
> * gcc.dg/permerror-gnu89-nopermissive.c
> (incompatible_pointer_types):   Likewise.
> * gcc.dg/permerror-pedantic.c (incompatible_pointer_types):
> Likewise.
> * gcc.dg/permerror-system.c: Likewise.
> * gcc.dg/Wincompatible-pointer-types-2.c: Compile with
> -fpermissivedue to expected errors.
> * gcc.dg/Wincompatible-pointer-types-5.c: New test.  Copied
> from gcc.dg/Wincompatible-pointer-types-2.c.  Expect errors.
> * gcc.dg/anon-struct-11.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/anon-struct-11a.c: New test.  Copied from
> gcc.dg/anon-struct-11.c.  Expect errors.
> * gcc.dg/anon-struct-13.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/anon-struct-13a.c: New test.  Copied from
> gcc.dg/anon-struct-13.c.  Expect errors.
> * gcc.dg/builtin-arith-overflow-4.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/builtin-arith-overflow-4a.c: New test.  Copied from
> gcc.dg/builtin-arith-overflow-4.c.  Expect errors.
> * gcc.dg/c23-qual-4.c: Expect -Wincompatible-pointer-types errors.
> * gcc.dg/dfp/composite-type.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/dfp/composite-type-2.c: New test.  Copied from
> gcc.dg/dfp/composite-type.c.  Expect errors.
> * gcc.dg/diag-aka-1.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/diag-aka-1a.c: New test.  Copied from gcc.dg/diag-aka-1a.c.
> Expect errors.
> * gcc.dg/enum-compat-1.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/enum-compat-2.c: New test.  Copied from
> gcc.dg/enum-compat-1.c.  Expect errors.
> * gcc.dg/func-ptr-conv-1.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/func-ptr-conv-2.c: New test.  Copied from
> gcc.dg/func-ptr-conv-1.c.  Expect errors.
> * gcc.dg/init-bad-7.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/init-bad-7a.c: New test.  Copied from gcc.dg/init-bad-7.c.
> Expect errors.
> * gcc.dg/noncompile/incomplete-3.c (foo): Expect
> -Wincompatible-pointer-types error.
> * gcc.dg/param-type-mismatch-2.c (test8): Likewise.
> * gcc.dg/pointer-array-atomic.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/pointer-array-atomic-2.c: New test.  Copied from
> gcc.dg/pointer-array-atomic.c.  Expect errors.
> * gcc.dg/pointer-array-quals-1.c (test): Expect
> -Wincompatible-pointer-types errors.
> * gcc.dg/transparent-union-1.c: Compile with -fpermissive
> due to expected errors.
> * gcc.dg/transparent-union-1a.c: New test.  Copied from
> gcc.dg/transparent-union-1.c.  Expect errors.
> * gcc.target/aarch64/acle/memtag_2a.c
> (test_memtag_warning_return_qualifier): Expect additional
> errors.
> * gcc.target/aarch64/sve/acle/general-c/load_2.c (f1): Likewise.
> * gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_1.c
> (f1): Likewise.
> * gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_2.c
> (f1): Likewise.
> * 

Re: [PATCH v8] c++: implement P2564, consteval needs to propagate up [PR107687]

2023-12-06 Thread Prathamesh Kulkarni
On Tue, 5 Dec 2023 at 06:18, Marek Polacek  wrote:
>
> On Mon, Dec 04, 2023 at 04:49:29PM -0500, Jason Merrill wrote:
> > On 12/4/23 15:23, Marek Polacek wrote:
> > > +/* FN is not a consteval function, but may become one.  Remember to
> > > +   escalate it after all pending templates have been instantiated.  */
> > > +
> > > +void
> > > +maybe_store_immediate_escalating_fn (tree fn)
> > > +{
> > > +  if (unchecked_immediate_escalating_function_p (fn))
> > > +remember_escalating_expr (fn);
> > > +}
> >
> > > +++ b/gcc/cp/decl.cc
> > > @@ -18441,7 +18441,10 @@ finish_function (bool inline_p)
> > >if (!processing_template_decl
> > >&& !DECL_IMMEDIATE_FUNCTION_P (fndecl)
> > >&& !DECL_OMP_DECLARE_REDUCTION_P (fndecl))
> > > -cp_fold_function (fndecl);
> > > +{
> > > +  cp_fold_function (fndecl);
> > > +  maybe_store_immediate_escalating_fn (fndecl);
> > > +}
> >
> > I think maybe_store_, and the call to it from finish_function, are unneeded;
> > we will have already decided whether we need to remember the function during
> > the call to cp_fold_function.
>
> 'Tis true.
>
> > OK with that change.
>
> Here's what I pushed after another regtest.  Thanks!
Hi Marek,
It seems the patch caused following regressions on aarch64:

Running g++:g++.dg/modules/modules.exp ...
FAIL: g++.dg/modules/xtreme-header-4_b.C -std=c++2b (internal compiler
error: tree check: expected class 'type', have 'declaration'
(template_decl) in get_originating_module_decl, at cp/module.cc:18659)
FAIL: g++.dg/modules/xtreme-header-5_b.C -std=c++2b (internal compiler
error: tree check: expected class 'type', have 'declaration'
(template_decl) in get_originating_module_decl, at cp/module.cc:18659)
FAIL: g++.dg/modules/xtreme-header_b.C -std=c++2b (internal compiler
error: tree check: expected class 'type', have 'declaration'
(template_decl) in get_originating_module_decl, at cp/module.cc:18659)

Log files: 
https://ci.linaro.org/job/tcwg_gcc_check--master-aarch64-build/1299/artifact/artifacts/00-sumfiles/

Thanks,
Prathamesh
>
> -- >8 --
> This patch implements P2564, described at <https://wg21.link/p2564>, whereby
> certain functions are promoted to consteval.  For example:
>
>   consteval int id(int i) { return i; }
>
>   template 
>   constexpr int f(T t)
>   {
> return t + id(t); // id causes f to be promoted to consteval
>   }
>
>   void g(int i)
>   {
> f (3);
>   }
>
> now compiles.  Previously the code was ill-formed: we would complain
> that 't' in 'f' is not a constant expression.  Since 'f' is now
> consteval, it means that the call to id(t) is in an immediate context,
> so doesn't have to produce a constant -- this is how we allow consteval
> functions composition.  But making 'f' consteval also means that
> the call to 'f' in 'g' must yield a constant; failure to do so results
> in an error.  I made the effort to have cc1plus explain to us what's
> going on.  For example, calling f(i) produces this neat diagnostic:
>
> w.C:11:11: error: call to consteval function 'f(i)' is not a constant 
> expression
>11 | f (i);
>   | ~~^~~
> w.C:11:11: error: 'i' is not a constant expression
> w.C:6:22: note: 'constexpr int f(T) [with T = int]' was promoted to an 
> immediate function because its body contains an immediate-escalating 
> expression 'id(t)'
> 6 | return t + id(t); // id causes f to be promoted to 
> consteval
>   |~~^~~
>
> which hopefully makes it clear what's going on.
>
> Implementing this proposal has been tricky.  One problem was delayed
> instantiation: instantiating a function can set off a domino effect
> where one call promotes a function to consteval but that then means
> that another function should also be promoted, etc.
>
> In v1, I addressed the delayed instantiation problem by instantiating
> trees early, so that we can escalate functions right away.  That caused
> a number of problems, and in certain cases, like consteval-prop3.C, it
> can't work, because we need to wait till EOF to see the definition of
> the function anyway.  Overeager instantiation tends to cause diagnostic
> problems too.
>
> In v2, I attempted to move the escalation to the gimplifier, at which
> point all templates have been instantiated.  That attempt flopped,
> however, because once we've gimplified a function, its body is discarded
> and as a consequence, you can no longer evaluate a call to that function
> which is required for escalating, which needs to decide if a call is
> a constant expression or not.
>
> Therefore, we have to perform the escalation before gimplifying, but
> after instantiate_pending_templates.  That's not easy because we have
> no way to walk all the trees.  In the v2 patch, I use two vectors: one
> to store function decls that may become consteval, and another to
> remember references to immediate-escalating functions.  Unfortunately
> the latter must also stash functions that call immediate-escalating
> functions.  Consider:
>
>   

Re: [aarch64] PR111702 - ICE in insert_regs after interleave+zip1 vector initialization patch

2023-12-04 Thread Prathamesh Kulkarni
On Thu, 23 Nov 2023 at 17:06, Prathamesh Kulkarni
 wrote:
>
> Hi Richard,
> For the test-case mentioned in PR111702, compiling with -O2
> -frounding-math -fstack-protector-all results in following ICE during
> cse2 pass:
>
> test.c: In function 'foo':
> test.c:119:1: internal compiler error: in insert_regs, at cse.cc:1120
>   119 | }
>   | ^
> 0xb7ebb0 insert_regs
> ../../gcc/gcc/cse.cc:1120
> 0x1f95134 merge_equiv_classes
> ../../gcc/gcc/cse.cc:1764
> 0x1f9b9ab cse_insn
> ../../gcc/gcc/cse.cc:4793
> 0x1f9fe30 cse_extended_basic_block
> ../../gcc/gcc/cse.cc:6577
> 0x1f9fe30 cse_main
> ../../gcc/gcc/cse.cc:6722
> 0x1fa0984 rest_of_handle_cse2
> ../../gcc/gcc/cse.cc:7620
> 0x1fa0984 execute
> ../../gcc/gcc/cse.cc:7675
>
> This happens only with interleave+zip1 vector initialization with
> -frounding-math -fstack-protector-all, while it compiles OK without
> -fstack-protector-all. Also, it compiles OK with fallback sequence
> code-gen (with or without -fstack-protector-all). Unfortunately, I
> haven't been able to reduce the test-case further :/
>
> From the test-case, it seems only the vector initializer for type J
> uses interleave+zip1 approach, while rest of the vector initializers
> use fallback sequence.
>
> J is defined as:
> typedef _Float16 __attribute__((__vector_size__ (16))) J;
>
> and the initializer is:
> (J) { 11654, 4801, 5535, 9743, 61680}
>
> interleave+zip1 sequence for above initializer J:
> mode = V8HF
>
> vals: (parallel:V8HF [
> (reg:HF 642)
> (reg:HF 645)
> (reg:HF 648)
> (reg:HF 651)
> (reg:HF 654)
> (const_double:HF 0.0 [0x0.0p+0]) repeated x3
> ])
>
> target: (reg:V8HF 641)
> seq:
> (insn 1058 0 1059 (set (reg:V4HF 657)
> (const_vector:V4HF [
> (const_double:HF 0.0 [0x0.0p+0]) repeated x4
> ])) "test.c":81:8 -1
>  (nil))
> (insn 1059 1058 1060 (set (reg:V4HF 657)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 642))
> (reg:V4HF 657)
> (const_int 1 [0x1]))) "test.c":81:8 -1
>  (nil))
> (insn 1060 1059 1061 (set (reg:V4HF 657)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 648))
> (reg:V4HF 657)
> (const_int 2 [0x2]))) "test.c":81:8 -1
>  (nil))
> (insn 1061 1060 1062 (set (reg:V4HF 657)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 654))
> (reg:V4HF 657)
> (const_int 4 [0x4]))) "test.c":81:8 -1
>  (nil))
> (insn 1062 1061 1063 (set (reg:V4HF 658)
> (const_vector:V4HF [
> (const_double:HF 0.0 [0x0.0p+0]) repeated x4
> ])) "test.c":81:8 -1
>  (nil))
> (insn 1063 1062 1064 (set (reg:V4HF 658)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 645))
> (reg:V4HF 658)
> (const_int 1 [0x1]))) "test.c":81:8 -1
>  (nil))
> (insn 1064 1063 1065 (set (reg:V4HF 658)
> (vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 651))
> (reg:V4HF 658)
> (const_int 2 [0x2]))) "test.c":81:8 -1
>  (nil))
> (insn 1065 1064 0 (set (reg:V8HF 641)
> (unspec:V8HF [
> (subreg:V8HF (reg:V4HF 657) 0)
> (subreg:V8HF (reg:V4HF 658) 0)
> ] UNSPEC_ZIP1)) "test.c":81:8 -1
>  (nil))
>
> It seems to me that the above sequence correctly initializes the
> vector into r641 ?
> insns 1058-1061 construct r657 = { r642, r648, r654, 0 }
> insns 1062-1064 construct r658 = { r645, r651, 0, 0 }
> and zip1 will create r641 = { r642, r645, r648, r651, r654, 0, 0, 0 }
>
> For the above test, it seems that with interleave+zip1 approach and
> -fstack-protector-all,
> in cse pass, there are two separate equivalence classes created for
> (const_int 1), that need
> to be merged in cse_insn:
>
>if (elt->first_same_value != src_eqv_elt->first_same_value)
> {
>   /* The REG_EQUAL is indicating that two formerly distinct
>  classes are now equivalent.  So merge them.  */
>   merge_equiv_classes (elt, src_eqv_elt);
>
> elt equivalence chain:
> Equivalence chain for (subreg:QI (reg:V16QI 671) 0):
> (subreg:QI (reg:V16QI 671) 0)
> (const_int 1 [0x1])
>
> src_eqv_elt equivalence chain:
> Equivalence chain for (const_int 1 [0x1]):
> (reg:QI 34 v2)
> (reg:QI 32 v0)
> (reg:QI 34 v2)
> (const_int 1 [0x1])
> (vec_select:QI (reg:V16QI 671)
> (parallel [
> (const_int 1 [0x1])
> ]))
> (vec_selec

Re: PR111754

2023-11-27 Thread Prathamesh Kulkarni
On Fri, 24 Nov 2023 at 03:13, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Thu, 26 Oct 2023 at 09:43, Prathamesh Kulkarni
> >  wrote:
> >>
> >> On Thu, 26 Oct 2023 at 04:09, Richard Sandiford
> >>  wrote:
> >> >
> >> > Prathamesh Kulkarni  writes:
> >> > > On Wed, 25 Oct 2023 at 02:58, Richard Sandiford
> >> > >  wrote:
> >> > >> So I think the PR could be solved by something like the attached.
> >> > >> Do you agree?  If so, could you base the patch on this instead?
> >> > >>
> >> > >> Only tested against the self-tests.
> >> > >>
> >> > >> Thanks,
> >> > >> Richard
> >> > >>
> >> > >> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> >> > >> index 40767736389..00fce4945a7 100644
> >> > >> --- a/gcc/fold-const.cc
> >> > >> +++ b/gcc/fold-const.cc
> >> > >> @@ -10743,27 +10743,37 @@ fold_vec_perm_cst (tree type, tree arg0, 
> >> > >> tree arg1, const vec_perm_indices ,
> >> > >>unsigned res_npatterns, res_nelts_per_pattern;
> >> > >>unsigned HOST_WIDE_INT res_nelts;
> >> > >>
> >> > >> -  /* (1) If SEL is a suitable mask as determined by
> >> > >> - valid_mask_for_fold_vec_perm_cst_p, then:
> >> > >> - res_npatterns = max of npatterns between ARG0, ARG1, and SEL
> >> > >> - res_nelts_per_pattern = max of nelts_per_pattern between
> >> > >> -ARG0, ARG1 and SEL.
> >> > >> - (2) If SEL is not a suitable mask, and TYPE is VLS then:
> >> > >> - res_npatterns = nelts in result vector.
> >> > >> - res_nelts_per_pattern = 1.
> >> > >> - This exception is made so that VLS ARG0, ARG1 and SEL work as 
> >> > >> before.  */
> >> > >> -  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> >> > >> -{
> >> > >> -  res_npatterns
> >> > >> -   = std::max (VECTOR_CST_NPATTERNS (arg0),
> >> > >> -   std::max (VECTOR_CST_NPATTERNS (arg1),
> >> > >> - sel.encoding ().npatterns ()));
> >> > >> +  /* First try to implement the fold in a VLA-friendly way.
> >> > >> +
> >> > >> + (1) If the selector is simply a duplication of N elements, the
> >> > >> +result is likewise a duplication of N elements.
> >> > >> +
> >> > >> + (2) If the selector is N elements followed by a duplication
> >> > >> +of N elements, the result is too.
> >> > >>
> >> > >> -  res_nelts_per_pattern
> >> > >> -   = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
> >> > >> -   std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
> >> > >> - sel.encoding ().nelts_per_pattern ()));
> >> > >> + (3) If the selector is N elements followed by an interleaving
> >> > >> +of N linear series, the situation is more complex.
> >> > >>
> >> > >> +valid_mask_for_fold_vec_perm_cst_p detects whether we
> >> > >> +can handle this case.  If we can, then each of the N linear
> >> > >> +series either (a) selects the same element each time or
> >> > >> +(b) selects a linear series from one of the input patterns.
> >> > >> +
> >> > >> +If (b) holds for one of the linear series, the result
> >> > >> +will contain a linear series, and so the result will have
> >> > >> +the same shape as the selector.  If (a) holds for all of
> >> > >> +the linear series, the result will be the same as (2) above.
> >> > >> +
> >> > >> +(b) can only hold if one of the input patterns has a
> >> > >> +stepped encoding.  */
> >> > >> +  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> >> > >> +{
> >> > >> +  res_npatterns = sel.encoding ().npatterns ();
> >> > >> +  res_nelts_per_pattern = sel.encoding ().nelts_per_pat

Re: PR111754

2023-11-23 Thread Prathamesh Kulkarni
On Wed, 15 Nov 2023 at 20:44, Prathamesh Kulkarni
 wrote:
>
> On Wed, 8 Nov 2023 at 21:57, Prathamesh Kulkarni
>  wrote:
> >
> > On Thu, 26 Oct 2023 at 09:43, Prathamesh Kulkarni
> >  wrote:
> > >
> > > On Thu, 26 Oct 2023 at 04:09, Richard Sandiford
> > >  wrote:
> > > >
> > > > Prathamesh Kulkarni  writes:
> > > > > On Wed, 25 Oct 2023 at 02:58, Richard Sandiford
> > > > >  wrote:
> > > > >>
> > > > >> Hi,
> > > > >>
> > > > >> Sorry the slow review.  I clearly didn't think this through properly
> > > > >> when doing the review of the original patch, so I wanted to spend
> > > > >> some time working on the code to get a better understanding of
> > > > >> the problem.
> > > > >>
> > > > >> Prathamesh Kulkarni  writes:
> > > > >> > Hi,
> > > > >> > For the following test-case:
> > > > >> >
> > > > >> > typedef float __attribute__((__vector_size__ (16))) F;
> > > > >> > F foo (F a, F b)
> > > > >> > {
> > > > >> >   F v = (F) { 9 };
> > > > >> >   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> > > > >> > }
> > > > >> >
> > > > >> > Compiling with -O2 results in following ICE:
> > > > >> > foo.c: In function ‘foo’:
> > > > >> > foo.c:6:10: internal compiler error: in decompose, at rtl.h:2314
> > > > >> > 6 |   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> > > > >> >   |  ^~
> > > > >> > 0x7f3185 wi::int_traits
> > > > >> >>::decompose(long*, unsigned int, std::pair
> > > > >> > const&)
> > > > >> > ../../gcc/gcc/rtl.h:2314
> > > > >> > 0x7f3185 wide_int_ref_storage > > > >> > false>::wide_int_ref_storage
> > > > >> >>(std::pair const&)
> > > > >> > ../../gcc/gcc/wide-int.h:1089
> > > > >> > 0x7f3185 generic_wide_int
> > > > >> >>::generic_wide_int
> > > > >> >>(std::pair const&)
> > > > >> > ../../gcc/gcc/wide-int.h:847
> > > > >> > 0x7f3185 poly_int<1u, generic_wide_int > > > >> > false> > >::poly_int
> > > > >> >>(poly_int_full, std::pair const&)
> > > > >> > ../../gcc/gcc/poly-int.h:467
> > > > >> > 0x7f3185 poly_int<1u, generic_wide_int > > > >> > false> > >::poly_int
> > > > >> >>(std::pair const&)
> > > > >> > ../../gcc/gcc/poly-int.h:453
> > > > >> > 0x7f3185 wi::to_poly_wide(rtx_def const*, machine_mode)
> > > > >> > ../../gcc/gcc/rtl.h:2383
> > > > >> > 0x7f3185 rtx_vector_builder::step(rtx_def*, rtx_def*) const
> > > > >> > ../../gcc/gcc/rtx-vector-builder.h:122
> > > > >> > 0xfd4e1b vector_builder > > > >> > rtx_vector_builder>::elt(unsigned int) const
> > > > >> > ../../gcc/gcc/vector-builder.h:253
> > > > >> > 0xfd4d11 rtx_vector_builder::build()
> > > > >> > ../../gcc/gcc/rtx-vector-builder.cc:73
> > > > >> > 0xc21d9c const_vector_from_tree
> > > > >> > ../../gcc/gcc/expr.cc:13487
> > > > >> > 0xc21d9c expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
> > > > >> > expand_modifier, rtx_def**, bool)
> > > > >> > ../../gcc/gcc/expr.cc:11059
> > > > >> > 0xaee682 expand_expr(tree_node*, rtx_def*, machine_mode, 
> > > > >> > expand_modifier)
> > > > >> > ../../gcc/gcc/expr.h:310
> > > > >> > 0xaee682 expand_return
> > > > >> > ../../gcc/gcc/cfgexpand.cc:3809
> > > > >> > 0xaee682 expand_gimple_stmt_1
> > > > >> > ../../gcc/gcc/cfgexpand.cc:3918
> > > > >> > 0xaee682 expand_gimple_stmt
> > > > >> > ../../gcc/gcc/cfgexpand.cc:4044
> > > > >> > 0xaf28f0 expand_g

[aarch64] PR111702 - ICE in insert_regs after interleave+zip1 vector initialization patch

2023-11-23 Thread Prathamesh Kulkarni
Hi Richard,
For the test-case mentioned in PR111702, compiling with -O2
-frounding-math -fstack-protector-all results in following ICE during
cse2 pass:

test.c: In function 'foo':
test.c:119:1: internal compiler error: in insert_regs, at cse.cc:1120
  119 | }
  | ^
0xb7ebb0 insert_regs
../../gcc/gcc/cse.cc:1120
0x1f95134 merge_equiv_classes
../../gcc/gcc/cse.cc:1764
0x1f9b9ab cse_insn
../../gcc/gcc/cse.cc:4793
0x1f9fe30 cse_extended_basic_block
../../gcc/gcc/cse.cc:6577
0x1f9fe30 cse_main
../../gcc/gcc/cse.cc:6722
0x1fa0984 rest_of_handle_cse2
../../gcc/gcc/cse.cc:7620
0x1fa0984 execute
../../gcc/gcc/cse.cc:7675

This happens only with interleave+zip1 vector initialization with
-frounding-math -fstack-protector-all, while it compiles OK without
-fstack-protector-all. Also, it compiles OK with fallback sequence
code-gen (with or without -fstack-protector-all). Unfortunately, I
haven't been able to reduce the test-case further :/

From the test-case, it seems only the vector initializer for type J
uses interleave+zip1 approach, while rest of the vector initializers
use fallback sequence.

J is defined as:
typedef _Float16 __attribute__((__vector_size__ (16))) J;

and the initializer is:
(J) { 11654, 4801, 5535, 9743, 61680}

interleave+zip1 sequence for above initializer J:
mode = V8HF

vals: (parallel:V8HF [
(reg:HF 642)
(reg:HF 645)
(reg:HF 648)
(reg:HF 651)
(reg:HF 654)
(const_double:HF 0.0 [0x0.0p+0]) repeated x3
])

target: (reg:V8HF 641)
seq:
(insn 1058 0 1059 (set (reg:V4HF 657)
(const_vector:V4HF [
(const_double:HF 0.0 [0x0.0p+0]) repeated x4
])) "test.c":81:8 -1
 (nil))
(insn 1059 1058 1060 (set (reg:V4HF 657)
(vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 642))
(reg:V4HF 657)
(const_int 1 [0x1]))) "test.c":81:8 -1
 (nil))
(insn 1060 1059 1061 (set (reg:V4HF 657)
(vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 648))
(reg:V4HF 657)
(const_int 2 [0x2]))) "test.c":81:8 -1
 (nil))
(insn 1061 1060 1062 (set (reg:V4HF 657)
(vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 654))
(reg:V4HF 657)
(const_int 4 [0x4]))) "test.c":81:8 -1
 (nil))
(insn 1062 1061 1063 (set (reg:V4HF 658)
(const_vector:V4HF [
(const_double:HF 0.0 [0x0.0p+0]) repeated x4
])) "test.c":81:8 -1
 (nil))
(insn 1063 1062 1064 (set (reg:V4HF 658)
(vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 645))
(reg:V4HF 658)
(const_int 1 [0x1]))) "test.c":81:8 -1
 (nil))
(insn 1064 1063 1065 (set (reg:V4HF 658)
(vec_merge:V4HF (vec_duplicate:V4HF (reg:HF 651))
(reg:V4HF 658)
(const_int 2 [0x2]))) "test.c":81:8 -1
 (nil))
(insn 1065 1064 0 (set (reg:V8HF 641)
(unspec:V8HF [
(subreg:V8HF (reg:V4HF 657) 0)
(subreg:V8HF (reg:V4HF 658) 0)
] UNSPEC_ZIP1)) "test.c":81:8 -1
 (nil))

It seems to me that the above sequence correctly initializes the
vector into r641 ?
insns 1058-1061 construct r657 = { r642, r648, r654, 0 }
insns 1062-1064 construct r658 = { r645, r651, 0, 0 }
and zip1 will create r641 = { r642, r645, r648, r651, r654, 0, 0, 0 }

For the above test, it seems that with interleave+zip1 approach and
-fstack-protector-all,
in cse pass, there are two separate equivalence classes created for
(const_int 1), that need
to be merged in cse_insn:

   if (elt->first_same_value != src_eqv_elt->first_same_value)
{
  /* The REG_EQUAL is indicating that two formerly distinct
 classes are now equivalent.  So merge them.  */
  merge_equiv_classes (elt, src_eqv_elt);

elt equivalence chain:
Equivalence chain for (subreg:QI (reg:V16QI 671) 0):
(subreg:QI (reg:V16QI 671) 0)
(const_int 1 [0x1])

src_eqv_elt equivalence chain:
Equivalence chain for (const_int 1 [0x1]):
(reg:QI 34 v2)
(reg:QI 32 v0)
(reg:QI 34 v2)
(const_int 1 [0x1])
(vec_select:QI (reg:V16QI 671)
(parallel [
(const_int 1 [0x1])
]))
(vec_select:QI (reg:V16QI 32 v0)
(parallel [
(const_int 1 [0x1])
]))
(vec_select:QI (reg:V16QI 33 v1)
(parallel [
(const_int 2 [0x2])
]))
(vec_select:QI (reg:V16QI 33 v1)
(parallel [
(const_int 1 [0x1])
]))

The issue is that merge_equiv_classes doesn't seem to deal correctly with
multiple occurences of same register in class2 (src_eqv_elt), which
has two occurrences of
(reg:QI 34 v2)

In merge_equiv_classes, on first iteration, it will remove (reg:QI 34)
from reg_equiv_table
by calling delete_equiv_reg(34), and in insert_regs it will create an
entry for (reg:QI 34) in qty_table with new quantity number, and
create new equivalence in reg_eqv_table.

When we again come across (reg:QI 34) in class2, it will

Re: PR111754

2023-11-15 Thread Prathamesh Kulkarni
On Wed, 8 Nov 2023 at 21:57, Prathamesh Kulkarni
 wrote:
>
> On Thu, 26 Oct 2023 at 09:43, Prathamesh Kulkarni
>  wrote:
> >
> > On Thu, 26 Oct 2023 at 04:09, Richard Sandiford
> >  wrote:
> > >
> > > Prathamesh Kulkarni  writes:
> > > > On Wed, 25 Oct 2023 at 02:58, Richard Sandiford
> > > >  wrote:
> > > >>
> > > >> Hi,
> > > >>
> > > >> Sorry the slow review.  I clearly didn't think this through properly
> > > >> when doing the review of the original patch, so I wanted to spend
> > > >> some time working on the code to get a better understanding of
> > > >> the problem.
> > > >>
> > > >> Prathamesh Kulkarni  writes:
> > > >> > Hi,
> > > >> > For the following test-case:
> > > >> >
> > > >> > typedef float __attribute__((__vector_size__ (16))) F;
> > > >> > F foo (F a, F b)
> > > >> > {
> > > >> >   F v = (F) { 9 };
> > > >> >   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> > > >> > }
> > > >> >
> > > >> > Compiling with -O2 results in following ICE:
> > > >> > foo.c: In function ‘foo’:
> > > >> > foo.c:6:10: internal compiler error: in decompose, at rtl.h:2314
> > > >> > 6 |   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> > > >> >   |  ^~
> > > >> > 0x7f3185 wi::int_traits
> > > >> >>::decompose(long*, unsigned int, std::pair
> > > >> > const&)
> > > >> > ../../gcc/gcc/rtl.h:2314
> > > >> > 0x7f3185 wide_int_ref_storage > > >> > false>::wide_int_ref_storage
> > > >> >>(std::pair const&)
> > > >> > ../../gcc/gcc/wide-int.h:1089
> > > >> > 0x7f3185 generic_wide_int
> > > >> >>::generic_wide_int
> > > >> >>(std::pair const&)
> > > >> > ../../gcc/gcc/wide-int.h:847
> > > >> > 0x7f3185 poly_int<1u, generic_wide_int > > >> > false> > >::poly_int
> > > >> >>(poly_int_full, std::pair const&)
> > > >> > ../../gcc/gcc/poly-int.h:467
> > > >> > 0x7f3185 poly_int<1u, generic_wide_int > > >> > false> > >::poly_int
> > > >> >>(std::pair const&)
> > > >> > ../../gcc/gcc/poly-int.h:453
> > > >> > 0x7f3185 wi::to_poly_wide(rtx_def const*, machine_mode)
> > > >> > ../../gcc/gcc/rtl.h:2383
> > > >> > 0x7f3185 rtx_vector_builder::step(rtx_def*, rtx_def*) const
> > > >> > ../../gcc/gcc/rtx-vector-builder.h:122
> > > >> > 0xfd4e1b vector_builder > > >> > rtx_vector_builder>::elt(unsigned int) const
> > > >> > ../../gcc/gcc/vector-builder.h:253
> > > >> > 0xfd4d11 rtx_vector_builder::build()
> > > >> > ../../gcc/gcc/rtx-vector-builder.cc:73
> > > >> > 0xc21d9c const_vector_from_tree
> > > >> > ../../gcc/gcc/expr.cc:13487
> > > >> > 0xc21d9c expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
> > > >> > expand_modifier, rtx_def**, bool)
> > > >> > ../../gcc/gcc/expr.cc:11059
> > > >> > 0xaee682 expand_expr(tree_node*, rtx_def*, machine_mode, 
> > > >> > expand_modifier)
> > > >> > ../../gcc/gcc/expr.h:310
> > > >> > 0xaee682 expand_return
> > > >> > ../../gcc/gcc/cfgexpand.cc:3809
> > > >> > 0xaee682 expand_gimple_stmt_1
> > > >> > ../../gcc/gcc/cfgexpand.cc:3918
> > > >> > 0xaee682 expand_gimple_stmt
> > > >> > ../../gcc/gcc/cfgexpand.cc:4044
> > > >> > 0xaf28f0 expand_gimple_basic_block
> > > >> > ../../gcc/gcc/cfgexpand.cc:6100
> > > >> > 0xaf4996 execute
> > > >> > ../../gcc/gcc/cfgexpand.cc:6835
> > > >> >
> > > >> > IIUC, the issue is that fold_vec_perm returns a vector having float 
> > > >> > element
> > > >> > type with res_nelts_per_pattern == 3, and later ICE's when it 

Re: [PATCH v3 2/2]middle-end match.pd: optimize fneg (fabs (x)) to copysign (x, -1) [PR109154]

2023-11-10 Thread Prathamesh Kulkarni
On Mon, 6 Nov 2023 at 15:50, Tamar Christina  wrote:
>
> Hi All,
>
> This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
> canonical and allows a target to expand this sequence efficiently.  Such
> sequences are common in scientific code working with gradients.
>
> There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
> which I remove since this is a less efficient form.  The testsuite is also
> updated in light of this.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Hi Tamar,
It seems the patch caused following regressions on arm:

Running gcc:gcc.dg/dg.exp ...
FAIL: gcc.dg/pr55152-2.c scan-tree-dump-times optimized ".COPYSIGN" 1
FAIL: gcc.dg/pr55152-2.c scan-tree-dump-times optimized "ABS_EXPR" 1

Running gcc:gcc.dg/tree-ssa/tree-ssa.exp ...
FAIL: gcc.dg/tree-ssa/abs-4.c scan-tree-dump-times optimized "= -" 1
FAIL: gcc.dg/tree-ssa/abs-4.c scan-tree-dump-times optimized "= .COPYSIGN" 2
FAIL: gcc.dg/tree-ssa/abs-4.c scan-tree-dump-times optimized "= ABS_EXPR" 1
FAIL: gcc.dg/tree-ssa/backprop-6.c scan-tree-dump-times backprop
"Deleting[^\\n]* = -" 4
FAIL: gcc.dg/tree-ssa/backprop-6.c scan-tree-dump-times backprop
"Deleting[^\\n]* = ABS_EXPR <" 1
FAIL: gcc.dg/tree-ssa/backprop-6.c scan-tree-dump-times backprop
"Deleting[^\\n]* = \\.COPYSIGN" 2
FAIL: gcc.dg/tree-ssa/copy-sign-2.c scan-tree-dump-times optimized ".COPYSIGN" 1
FAIL: gcc.dg/tree-ssa/copy-sign-2.c scan-tree-dump-times optimized "ABS" 1
FAIL: gcc.dg/tree-ssa/mult-abs-2.c scan-tree-dump-times gimple ".COPYSIGN" 4
FAIL: gcc.dg/tree-ssa/mult-abs-2.c scan-tree-dump-times gimple "ABS" 4
FAIL: gcc.dg/tree-ssa/phi-opt-24.c scan-tree-dump-not phiopt2 "if"
Link to log files:
https://ci.linaro.org/job/tcwg_gcc_check--master-arm-build/1240/artifact/artifacts/00-sumfiles/

Even for following test-case:
double g (double a)
{
  double t1 = fabs (a);
  double t2 = -t1;
  return t2;
}

It seems, the pattern gets applied but doesn't get eventually
simplified to copysign(a, -1).
forwprop dump shows:
Applying pattern match.pd:1131, gimple-match-4.cc:4134
double g (double a)
{
  double t2;
  double t1;

   :
  t1_2 = ABS_EXPR ;
  t2_3 = -t1_2;
  return t2_3;

}

while on x86_64:
Applying pattern match.pd:1131, gimple-match-4.cc:4134
gimple_simplified to t2_3 = .COPYSIGN (a_1(D), -1.0e+0);
Removing dead stmt:t1_2 = ABS_EXPR ;
double g (double a)
{
  double t2;
  double t1;

  <bb 2> :
  t2_3 = .COPYSIGN (a_1(D), -1.0e+0);
  return t2_3;

}

Thanks,
Prathamesh


>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/109154
> * match.pd: Add new neg+abs rule, remove inverse copysign rule.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/109154
> * gcc.dg/fold-copysign-1.c: Updated.
> * gcc.dg/pr55152-2.c: Updated.
> * gcc.dg/tree-ssa/abs-4.c: Updated.
> * gcc.dg/tree-ssa/backprop-6.c: Updated.
> * gcc.dg/tree-ssa/copy-sign-2.c: Updated.
> * gcc.dg/tree-ssa/mult-abs-2.c: Updated.
> * gcc.target/aarch64/fneg-abs_1.c: New test.
> * gcc.target/aarch64/fneg-abs_2.c: New test.
> * gcc.target/aarch64/fneg-abs_3.c: New test.
> * gcc.target/aarch64/fneg-abs_4.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 
> db95931df0672cf4ef08cca36085c3aa6831519e..7a023d510c283c43a87b1795a74761b8af979b53
>  100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -1106,13 +1106,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> (hypots @0 (copysigns @1 @2))
> (hypots @0 @1
>
> -/* copysign(x, CST) -> [-]abs (x).  */
> -(for copysigns (COPYSIGN_ALL)
> - (simplify
> -  (copysigns @0 REAL_CST@1)
> -  (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
> -   (negate (abs @0))
> -   (abs @0
> +/* Transform fneg (fabs (X)) -> copysign (X, -1).  */
> +
> +(simplify
> + (negate (abs @0))
> + (IFN_COPYSIGN @0 { build_minus_one_cst (type); }))
>
>  /* copysign(copysign(x, y), z) -> copysign(x, z).  */
>  (for copysigns (COPYSIGN_ALL)
> diff --git a/gcc/testsuite/gcc.dg/fold-copysign-1.c 
> b/gcc/testsuite/gcc.dg/fold-copysign-1.c
> index 
> f17d65c24ee4dca9867827d040fe0a404c515e7b..f9cafd14ab05f5e8ab2f6f68e62801d21c2df6a6
>  100644
> --- a/gcc/testsuite/gcc.dg/fold-copysign-1.c
> +++ b/gcc/testsuite/gcc.dg/fold-copysign-1.c
> @@ -12,5 +12,5 @@ double bar (double x)
>return __builtin_copysign (x, minuszero);
>  }
>
> -/* { dg-final { scan-tree-dump-times "= -" 1 "cddce1" } } */
> -/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 2 "cddce1" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_copysign" 1 "cddce1" } } */
> +/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "cddce1" } } */
> diff --git 

Re: [PATCH v3] libiberty: Use posix_spawn in pex-unix when available.

2023-11-10 Thread Prathamesh Kulkarni
On Thu, 5 Oct 2023 at 00:00, Brendan Shanks  wrote:
>
> Hi,
>
> This patch implements pex_unix_exec_child using posix_spawn when
> available.
>
> This should especially benefit recent macOS (where vfork just calls
> fork), but should have equivalent or faster performance on all
> platforms.
> In addition, the implementation is substantially simpler than the
> vfork+exec code path.
>
> Tested on x86_64-linux.
Hi Brendan,
It seems this patch caused the following regressions on aarch64:

FAIL: g++.dg/modules/bad-mapper-1.C -std=c++17  at line 3 (test for
errors, line )
FAIL: g++.dg/modules/bad-mapper-1.C -std=c++17 (test for excess errors)
FAIL: g++.dg/modules/bad-mapper-1.C -std=c++2a  at line 3 (test for
errors, line )
FAIL: g++.dg/modules/bad-mapper-1.C -std=c++2a (test for excess errors)
FAIL: g++.dg/modules/bad-mapper-1.C -std=c++2b  at line 3 (test for
errors, line )
FAIL: g++.dg/modules/bad-mapper-1.C -std=c++2b (test for excess errors)

Looking at g++.log:
/home/tcwg-buildslave/workspace/tcwg_gnu_2/abe/snapshots/gcc.git~master/gcc/testsuite/g++.dg/modules/bad-mapper-1.C:
error: failed posix_spawnp mapper 'this-will-not-work'
In module imported at
/home/tcwg-buildslave/workspace/tcwg_gnu_2/abe/snapshots/gcc.git~master/gcc/testsuite/g++.dg/modules/bad-mapper-1.C:2:1:
unique1.bob: error: failed to read compiled module: No such file or directory
unique1.bob: note: compiled module file is 'gcm.cache/unique1.bob.gcm'
unique1.bob: note: imports must be built before being imported
unique1.bob: fatal error: returning to the gate for a mechanical issue
compilation terminated.

Link to log files:
https://ci.linaro.org/job/tcwg_gcc_check--master-aarch64-build/1159/artifact/artifacts/00-sumfiles/
Could you please investigate ?

Thanks,
Prathamesh
>
> v2: Fix error handling (previously the function would be run twice in
> case of error), and don't use a macro that changes control flow.
>
> v3: Match file style for error-handling blocks, don't close
> in/out/errdes on error, and check close() for errors.
>
> libiberty/
> * configure.ac (AC_CHECK_HEADERS): Add spawn.h.
> (checkfuncs): Add posix_spawn, posix_spawnp.
> (AC_CHECK_FUNCS): Add posix_spawn, posix_spawnp.
> * configure, config.in: Rebuild.
> * pex-unix.c [HAVE_POSIX_SPAWN] (pex_unix_exec_child): New function.
>
> Signed-off-by: Brendan Shanks 
> ---
>  libiberty/configure.ac |   8 +-
>  libiberty/pex-unix.c   | 168 +
>  2 files changed, 173 insertions(+), 3 deletions(-)
>
> diff --git a/libiberty/configure.ac b/libiberty/configure.ac
> index 0748c592704..2488b031bc8 100644
> --- a/libiberty/configure.ac
> +++ b/libiberty/configure.ac
> @@ -289,7 +289,7 @@ AC_SUBST_FILE(host_makefile_frag)
>  # It's OK to check for header files.  Although the compiler may not be
>  # able to link anything, it had better be able to at least compile
>  # something.
> -AC_CHECK_HEADERS(sys/file.h sys/param.h limits.h stdlib.h malloc.h string.h 
> unistd.h strings.h sys/time.h time.h sys/resource.h sys/stat.h sys/mman.h 
> fcntl.h alloca.h sys/pstat.h sys/sysmp.h sys/sysinfo.h machine/hal_sysinfo.h 
> sys/table.h sys/sysctl.h sys/systemcfg.h stdint.h stdio_ext.h process.h 
> sys/prctl.h)
> +AC_CHECK_HEADERS(sys/file.h sys/param.h limits.h stdlib.h malloc.h string.h 
> unistd.h strings.h sys/time.h time.h sys/resource.h sys/stat.h sys/mman.h 
> fcntl.h alloca.h sys/pstat.h sys/sysmp.h sys/sysinfo.h machine/hal_sysinfo.h 
> sys/table.h sys/sysctl.h sys/systemcfg.h stdint.h stdio_ext.h process.h 
> sys/prctl.h spawn.h)
>  AC_HEADER_SYS_WAIT
>  AC_HEADER_TIME
>
> @@ -412,7 +412,8 @@ funcs="$funcs setproctitle"
>  vars="sys_errlist sys_nerr sys_siglist"
>
>  checkfuncs="__fsetlocking canonicalize_file_name dup3 getrlimit getrusage \
> - getsysinfo gettimeofday on_exit pipe2 psignal pstat_getdynamic 
> pstat_getstatic \
> + getsysinfo gettimeofday on_exit pipe2 posix_spawn posix_spawnp psignal \
> + pstat_getdynamic pstat_getstatic \
>   realpath setrlimit spawnve spawnvpe strerror strsignal sysconf sysctl \
>   sysmp table times wait3 wait4"
>
> @@ -435,7 +436,8 @@ if test "x" = "y"; then
>  index insque \
>  memchr memcmp memcpy memmem memmove memset mkstemps \
>  on_exit \
> -pipe2 psignal pstat_getdynamic pstat_getstatic putenv \
> +pipe2 posix_spawn posix_spawnp psignal \
> +pstat_getdynamic pstat_getstatic putenv \
>  random realpath rename rindex \
>  sbrk setenv setproctitle setrlimit sigsetmask snprintf spawnve spawnvpe \
>   stpcpy stpncpy strcasecmp strchr strdup \
> diff --git a/libiberty/pex-unix.c b/libiberty/pex-unix.c
> index 33b5bce31c2..336799d1125 100644
> --- a/libiberty/pex-unix.c
> +++ b/libiberty/pex-unix.c
> @@ -58,6 +58,9 @@ extern int errno;
>  #ifdef HAVE_PROCESS_H
>  #include 
>  #endif
> +#ifdef HAVE_SPAWN_H
> +#include 
> +#endif
>
>  #ifdef vfork /* Autoconf may define this to fork for us. */
>  # define VFORK_STRING "fork"
> @@ -559,6 

Re: PR111754

2023-11-08 Thread Prathamesh Kulkarni
On Thu, 26 Oct 2023 at 09:43, Prathamesh Kulkarni
 wrote:
>
> On Thu, 26 Oct 2023 at 04:09, Richard Sandiford
>  wrote:
> >
> > Prathamesh Kulkarni  writes:
> > > On Wed, 25 Oct 2023 at 02:58, Richard Sandiford
> > >  wrote:
> > >>
> > >> Hi,
> > >>
> > >> Sorry the slow review.  I clearly didn't think this through properly
> > >> when doing the review of the original patch, so I wanted to spend
> > >> some time working on the code to get a better understanding of
> > >> the problem.
> > >>
> > >> Prathamesh Kulkarni  writes:
> > >> > Hi,
> > >> > For the following test-case:
> > >> >
> > >> > typedef float __attribute__((__vector_size__ (16))) F;
> > >> > F foo (F a, F b)
> > >> > {
> > >> >   F v = (F) { 9 };
> > >> >   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> > >> > }
> > >> >
> > >> > Compiling with -O2 results in following ICE:
> > >> > foo.c: In function ‘foo’:
> > >> > foo.c:6:10: internal compiler error: in decompose, at rtl.h:2314
> > >> > 6 |   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> > >> >   |  ^~
> > >> > 0x7f3185 wi::int_traits
> > >> >>::decompose(long*, unsigned int, std::pair
> > >> > const&)
> > >> > ../../gcc/gcc/rtl.h:2314
> > >> > 0x7f3185 wide_int_ref_storage > >> > false>::wide_int_ref_storage
> > >> >>(std::pair const&)
> > >> > ../../gcc/gcc/wide-int.h:1089
> > >> > 0x7f3185 generic_wide_int
> > >> >>::generic_wide_int
> > >> >>(std::pair const&)
> > >> > ../../gcc/gcc/wide-int.h:847
> > >> > 0x7f3185 poly_int<1u, generic_wide_int > >> > false> > >::poly_int
> > >> >>(poly_int_full, std::pair const&)
> > >> > ../../gcc/gcc/poly-int.h:467
> > >> > 0x7f3185 poly_int<1u, generic_wide_int > >> > false> > >::poly_int
> > >> >>(std::pair const&)
> > >> > ../../gcc/gcc/poly-int.h:453
> > >> > 0x7f3185 wi::to_poly_wide(rtx_def const*, machine_mode)
> > >> > ../../gcc/gcc/rtl.h:2383
> > >> > 0x7f3185 rtx_vector_builder::step(rtx_def*, rtx_def*) const
> > >> > ../../gcc/gcc/rtx-vector-builder.h:122
> > >> > 0xfd4e1b vector_builder > >> > rtx_vector_builder>::elt(unsigned int) const
> > >> > ../../gcc/gcc/vector-builder.h:253
> > >> > 0xfd4d11 rtx_vector_builder::build()
> > >> > ../../gcc/gcc/rtx-vector-builder.cc:73
> > >> > 0xc21d9c const_vector_from_tree
> > >> > ../../gcc/gcc/expr.cc:13487
> > >> > 0xc21d9c expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
> > >> > expand_modifier, rtx_def**, bool)
> > >> > ../../gcc/gcc/expr.cc:11059
> > >> > 0xaee682 expand_expr(tree_node*, rtx_def*, machine_mode, 
> > >> > expand_modifier)
> > >> > ../../gcc/gcc/expr.h:310
> > >> > 0xaee682 expand_return
> > >> > ../../gcc/gcc/cfgexpand.cc:3809
> > >> > 0xaee682 expand_gimple_stmt_1
> > >> > ../../gcc/gcc/cfgexpand.cc:3918
> > >> > 0xaee682 expand_gimple_stmt
> > >> > ../../gcc/gcc/cfgexpand.cc:4044
> > >> > 0xaf28f0 expand_gimple_basic_block
> > >> > ../../gcc/gcc/cfgexpand.cc:6100
> > >> > 0xaf4996 execute
> > >> > ../../gcc/gcc/cfgexpand.cc:6835
> > >> >
> > >> > IIUC, the issue is that fold_vec_perm returns a vector having float 
> > >> > element
> > >> > type with res_nelts_per_pattern == 3, and later ICE's when it tries
> > >> > to derive element v[3], not present in the encoding, while trying to
> > >> > build rtx vector
> > >> > in rtx_vector_builder::build():
> > >> >  for (unsigned int i = 0; i < nelts; ++i)
> > >> > RTVEC_ELT (v, i) = elt (i);
> > >> >
> > >> > The attached patch tries to fix this by returning false from
> > >> > valid_mask_for_fold_vec_perm_cst if sel has a 

Re: PR111754

2023-10-25 Thread Prathamesh Kulkarni
On Thu, 26 Oct 2023 at 04:09, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Wed, 25 Oct 2023 at 02:58, Richard Sandiford
> >  wrote:
> >>
> >> Hi,
> >>
> >> Sorry the slow review.  I clearly didn't think this through properly
> >> when doing the review of the original patch, so I wanted to spend
> >> some time working on the code to get a better understanding of
> >> the problem.
> >>
> >> Prathamesh Kulkarni  writes:
> >> > Hi,
> >> > For the following test-case:
> >> >
> >> > typedef float __attribute__((__vector_size__ (16))) F;
> >> > F foo (F a, F b)
> >> > {
> >> >   F v = (F) { 9 };
> >> >   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> >> > }
> >> >
> >> > Compiling with -O2 results in following ICE:
> >> > foo.c: In function ‘foo’:
> >> > foo.c:6:10: internal compiler error: in decompose, at rtl.h:2314
> >> > 6 |   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> >> >   |  ^~
> >> > 0x7f3185 wi::int_traits
> >> >>::decompose(long*, unsigned int, std::pair
> >> > const&)
> >> > ../../gcc/gcc/rtl.h:2314
> >> > 0x7f3185 wide_int_ref_storage >> > false>::wide_int_ref_storage
> >> >>(std::pair const&)
> >> > ../../gcc/gcc/wide-int.h:1089
> >> > 0x7f3185 generic_wide_int
> >> >>::generic_wide_int
> >> >>(std::pair const&)
> >> > ../../gcc/gcc/wide-int.h:847
> >> > 0x7f3185 poly_int<1u, generic_wide_int >> > false> > >::poly_int
> >> >>(poly_int_full, std::pair const&)
> >> > ../../gcc/gcc/poly-int.h:467
> >> > 0x7f3185 poly_int<1u, generic_wide_int >> > false> > >::poly_int
> >> >>(std::pair const&)
> >> > ../../gcc/gcc/poly-int.h:453
> >> > 0x7f3185 wi::to_poly_wide(rtx_def const*, machine_mode)
> >> > ../../gcc/gcc/rtl.h:2383
> >> > 0x7f3185 rtx_vector_builder::step(rtx_def*, rtx_def*) const
> >> > ../../gcc/gcc/rtx-vector-builder.h:122
> >> > 0xfd4e1b vector_builder >> > rtx_vector_builder>::elt(unsigned int) const
> >> > ../../gcc/gcc/vector-builder.h:253
> >> > 0xfd4d11 rtx_vector_builder::build()
> >> > ../../gcc/gcc/rtx-vector-builder.cc:73
> >> > 0xc21d9c const_vector_from_tree
> >> > ../../gcc/gcc/expr.cc:13487
> >> > 0xc21d9c expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
> >> > expand_modifier, rtx_def**, bool)
> >> > ../../gcc/gcc/expr.cc:11059
> >> > 0xaee682 expand_expr(tree_node*, rtx_def*, machine_mode, expand_modifier)
> >> > ../../gcc/gcc/expr.h:310
> >> > 0xaee682 expand_return
> >> > ../../gcc/gcc/cfgexpand.cc:3809
> >> > 0xaee682 expand_gimple_stmt_1
> >> > ../../gcc/gcc/cfgexpand.cc:3918
> >> > 0xaee682 expand_gimple_stmt
> >> > ../../gcc/gcc/cfgexpand.cc:4044
> >> > 0xaf28f0 expand_gimple_basic_block
> >> > ../../gcc/gcc/cfgexpand.cc:6100
> >> > 0xaf4996 execute
> >> > ../../gcc/gcc/cfgexpand.cc:6835
> >> >
> >> > IIUC, the issue is that fold_vec_perm returns a vector having float 
> >> > element
> >> > type with res_nelts_per_pattern == 3, and later ICE's when it tries
> >> > to derive element v[3], not present in the encoding, while trying to
> >> > build rtx vector
> >> > in rtx_vector_builder::build():
> >> >  for (unsigned int i = 0; i < nelts; ++i)
> >> > RTVEC_ELT (v, i) = elt (i);
> >> >
> >> > The attached patch tries to fix this by returning false from
> >> > valid_mask_for_fold_vec_perm_cst if sel has a stepped sequence and
> >> > input vector has non-integral element type, so for VLA vectors, it
> >> > will only build result with dup sequence (nelts_per_pattern < 3) for
> >> > non-integral element type.
> >> >
> >> > For VLS vectors, this will still work for stepped sequence since it
> >> > will then use the "VLS exception" in fold_vec_perm_cst, and set:
> >> > res_npattern = res_nelts and
> >> > res_nelt

Re: PR111754

2023-10-25 Thread Prathamesh Kulkarni
On Wed, 25 Oct 2023 at 02:58, Richard Sandiford
 wrote:
>
> Hi,
>
> Sorry the slow review.  I clearly didn't think this through properly
> when doing the review of the original patch, so I wanted to spend
> some time working on the code to get a better understanding of
> the problem.
>
> Prathamesh Kulkarni  writes:
> > Hi,
> > For the following test-case:
> >
> > typedef float __attribute__((__vector_size__ (16))) F;
> > F foo (F a, F b)
> > {
> >   F v = (F) { 9 };
> >   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> > }
> >
> > Compiling with -O2 results in following ICE:
> > foo.c: In function ‘foo’:
> > foo.c:6:10: internal compiler error: in decompose, at rtl.h:2314
> > 6 |   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
> >   |  ^~
> > 0x7f3185 wi::int_traits
> >>::decompose(long*, unsigned int, std::pair
> > const&)
> > ../../gcc/gcc/rtl.h:2314
> > 0x7f3185 wide_int_ref_storage > false>::wide_int_ref_storage
> >>(std::pair const&)
> > ../../gcc/gcc/wide-int.h:1089
> > 0x7f3185 generic_wide_int
> >>::generic_wide_int
> >>(std::pair const&)
> > ../../gcc/gcc/wide-int.h:847
> > 0x7f3185 poly_int<1u, generic_wide_int > false> > >::poly_int
> >>(poly_int_full, std::pair const&)
> > ../../gcc/gcc/poly-int.h:467
> > 0x7f3185 poly_int<1u, generic_wide_int > false> > >::poly_int
> >>(std::pair const&)
> > ../../gcc/gcc/poly-int.h:453
> > 0x7f3185 wi::to_poly_wide(rtx_def const*, machine_mode)
> > ../../gcc/gcc/rtl.h:2383
> > 0x7f3185 rtx_vector_builder::step(rtx_def*, rtx_def*) const
> > ../../gcc/gcc/rtx-vector-builder.h:122
> > 0xfd4e1b vector_builder > rtx_vector_builder>::elt(unsigned int) const
> > ../../gcc/gcc/vector-builder.h:253
> > 0xfd4d11 rtx_vector_builder::build()
> > ../../gcc/gcc/rtx-vector-builder.cc:73
> > 0xc21d9c const_vector_from_tree
> > ../../gcc/gcc/expr.cc:13487
> > 0xc21d9c expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
> > expand_modifier, rtx_def**, bool)
> > ../../gcc/gcc/expr.cc:11059
> > 0xaee682 expand_expr(tree_node*, rtx_def*, machine_mode, expand_modifier)
> > ../../gcc/gcc/expr.h:310
> > 0xaee682 expand_return
> > ../../gcc/gcc/cfgexpand.cc:3809
> > 0xaee682 expand_gimple_stmt_1
> > ../../gcc/gcc/cfgexpand.cc:3918
> > 0xaee682 expand_gimple_stmt
> > ../../gcc/gcc/cfgexpand.cc:4044
> > 0xaf28f0 expand_gimple_basic_block
> > ../../gcc/gcc/cfgexpand.cc:6100
> > 0xaf4996 execute
> > ../../gcc/gcc/cfgexpand.cc:6835
> >
> > IIUC, the issue is that fold_vec_perm returns a vector having float element
> > type with res_nelts_per_pattern == 3, and later ICE's when it tries
> > to derive element v[3], not present in the encoding, while trying to
> > build rtx vector
> > in rtx_vector_builder::build():
> >  for (unsigned int i = 0; i < nelts; ++i)
> > RTVEC_ELT (v, i) = elt (i);
> >
> > The attached patch tries to fix this by returning false from
> > valid_mask_for_fold_vec_perm_cst if sel has a stepped sequence and
> > input vector has non-integral element type, so for VLA vectors, it
> > will only build result with dup sequence (nelts_per_pattern < 3) for
> > non-integral element type.
> >
> > For VLS vectors, this will still work for stepped sequence since it
> > will then use the "VLS exception" in fold_vec_perm_cst, and set:
> > res_npattern = res_nelts and
> > res_nelts_per_pattern = 1
> >
> > and fold the above case to:
> > F foo (F a, F b)
> > {
> >   <bb 2> [local count: 1073741824]:
> >   return { 0.0, 9.0e+0, 0.0, 0.0 };
> > }
> >
> > But I am not sure if this is entirely correct, since:
> > tree res = out_elts.build ();
> > will canonicalize the encoding and may result in a stepped sequence
> > (vector_builder::finalize() may reduce npatterns at the cost of increasing
> > nelts_per_pattern)  ?
> >
> > PS: This issue is now latent after PR111648 fix, since
> > valid_mask_for_fold_vec_perm_cst with  sel = {1, 0, 1, ...} returns
> > false because the corresponding pattern in arg0 is not a natural
> > stepped sequence, and folds correctly using VLS exception. However, I
> > guess the underlying issue of dealing with non-integral element types
> > in fold_vec_perm_cst still remain

PR111754

2023-10-20 Thread Prathamesh Kulkarni
Hi,
For the following test-case:

typedef float __attribute__((__vector_size__ (16))) F;
F foo (F a, F b)
{
  F v = (F) { 9 };
  return __builtin_shufflevector (v, v, 1, 0, 1, 2);
}

Compiling with -O2 results in following ICE:
foo.c: In function ‘foo’:
foo.c:6:10: internal compiler error: in decompose, at rtl.h:2314
6 |   return __builtin_shufflevector (v, v, 1, 0, 1, 2);
  |  ^~
0x7f3185 wi::int_traits
>::decompose(long*, unsigned int, std::pair
const&)
../../gcc/gcc/rtl.h:2314
0x7f3185 wide_int_ref_storage::wide_int_ref_storage
>(std::pair const&)
../../gcc/gcc/wide-int.h:1089
0x7f3185 generic_wide_int
>::generic_wide_int
>(std::pair const&)
../../gcc/gcc/wide-int.h:847
0x7f3185 poly_int<1u, generic_wide_int > >::poly_int
>(poly_int_full, std::pair const&)
../../gcc/gcc/poly-int.h:467
0x7f3185 poly_int<1u, generic_wide_int > >::poly_int
>(std::pair const&)
../../gcc/gcc/poly-int.h:453
0x7f3185 wi::to_poly_wide(rtx_def const*, machine_mode)
../../gcc/gcc/rtl.h:2383
0x7f3185 rtx_vector_builder::step(rtx_def*, rtx_def*) const
../../gcc/gcc/rtx-vector-builder.h:122
0xfd4e1b vector_builder::elt(unsigned int) const
../../gcc/gcc/vector-builder.h:253
0xfd4d11 rtx_vector_builder::build()
../../gcc/gcc/rtx-vector-builder.cc:73
0xc21d9c const_vector_from_tree
../../gcc/gcc/expr.cc:13487
0xc21d9c expand_expr_real_1(tree_node*, rtx_def*, machine_mode,
expand_modifier, rtx_def**, bool)
../../gcc/gcc/expr.cc:11059
0xaee682 expand_expr(tree_node*, rtx_def*, machine_mode, expand_modifier)
../../gcc/gcc/expr.h:310
0xaee682 expand_return
../../gcc/gcc/cfgexpand.cc:3809
0xaee682 expand_gimple_stmt_1
../../gcc/gcc/cfgexpand.cc:3918
0xaee682 expand_gimple_stmt
../../gcc/gcc/cfgexpand.cc:4044
0xaf28f0 expand_gimple_basic_block
../../gcc/gcc/cfgexpand.cc:6100
0xaf4996 execute
../../gcc/gcc/cfgexpand.cc:6835

IIUC, the issue is that fold_vec_perm returns a vector having float element
type with res_nelts_per_pattern == 3, and later ICE's when it tries
to derive element v[3], not present in the encoding, while trying to
build rtx vector
in rtx_vector_builder::build():
 for (unsigned int i = 0; i < nelts; ++i)
RTVEC_ELT (v, i) = elt (i);

The attached patch tries to fix this by returning false from
valid_mask_for_fold_vec_perm_cst if sel has a stepped sequence and
input vector has non-integral element type, so for VLA vectors, it
will only build result with dup sequence (nelts_per_pattern < 3) for
non-integral element type.

For VLS vectors, this will still work for stepped sequence since it
will then use the "VLS exception" in fold_vec_perm_cst, and set:
res_npattern = res_nelts and
res_nelts_per_pattern = 1

and fold the above case to:
F foo (F a, F b)
{
  <bb 2> [local count: 1073741824]:
  return { 0.0, 9.0e+0, 0.0, 0.0 };
}

But I am not sure if this is entirely correct, since:
tree res = out_elts.build ();
will canonicalize the encoding and may result in a stepped sequence
(vector_builder::finalize() may reduce npatterns at the cost of increasing
nelts_per_pattern)  ?

PS: This issue is now latent after PR111648 fix, since
valid_mask_for_fold_vec_perm_cst with  sel = {1, 0, 1, ...} returns
false because the corresponding pattern in arg0 is not a natural
stepped sequence, and folds correctly using VLS exception. However, I
guess the underlying issue of dealing with non-integral element types
in fold_vec_perm_cst still remains ?

The patch passes bootstrap+test with and without SVE on aarch64-linux-gnu,
and on x86_64-linux-gnu.

Thanks,
Prathamesh
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 82299bb7f1d..cedfc9616e9 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -10642,6 +10642,11 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree 
arg1,
   if (sel_nelts_per_pattern < 3)
 return true;
 
+  /* If SEL contains stepped sequence, ensure that we are dealing with
+ integral vector_cst.  */
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (arg0))))
+return false;
+
   for (unsigned pattern = 0; pattern < sel_npatterns; pattern++)
 {
   poly_uint64 a1 = sel[pattern + sel_npatterns];
diff --git a/gcc/testsuite/gcc.dg/vect/pr111754.c 
b/gcc/testsuite/gcc.dg/vect/pr111754.c
new file mode 100644
index 000..7c1c16875c7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr111754.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+typedef float __attribute__((__vector_size__ (16))) F;
+
+F foo (F a, F b)
+{
+  F v = (F) { 9 };
+  return __builtin_shufflevector (v, v, 1, 0, 1, 2);
+}
+
+/* { dg-final { scan-tree-dump-not "VEC_PERM_EXPR" "optimized" } } */
+/* { dg-final { scan-tree-dump "return \{ 0.0, 9.0e\\+0, 0.0, 0.0 \}" 
"optimized" } } */


Re: PR111648: Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding

2023-10-18 Thread Prathamesh Kulkarni
On Wed, 18 Oct 2023 at 23:22, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Tue, 17 Oct 2023 at 02:40, Richard Sandiford
> >  wrote:
> >> Prathamesh Kulkarni  writes:
> >> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> >> > index 4f8561509ff..55a6a68c16c 100644
> >> > --- a/gcc/fold-const.cc
> >> > +++ b/gcc/fold-const.cc
> >> > @@ -10684,9 +10684,8 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> >> > tree arg1,
> >> >
> >> >/* Ensure that the stepped sequence always selects from the same
> >> >input pattern.  */
> >> > -  unsigned arg_npatterns
> >> > - = ((q1 & 1) == 0) ? VECTOR_CST_NPATTERNS (arg0)
> >> > -   : VECTOR_CST_NPATTERNS (arg1);
> >> > +  tree arg = ((q1 & 1) == 0) ? arg0 : arg1;
> >> > +  unsigned arg_npatterns = VECTOR_CST_NPATTERNS (arg);
> >> >
> >> >if (!multiple_p (step, arg_npatterns))
> >> >   {
> >> > @@ -10694,6 +10693,29 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> >> > tree arg1,
> >> >   *reason = "step is not multiple of npatterns";
> >> > return false;
> >> >   }
> >> > +
> >> > +  /* If a1 chooses base element from arg, ensure that it's a natural
> >> > +  stepped sequence, ie, (arg[2] - arg[1]) == (arg[1] - arg[0])
> >> > +  to preserve arg's encoding.  */
> >> > +
> >> > +  unsigned HOST_WIDE_INT index;
> >> > +  if (!r1.is_constant ())
> >> > + return false;
> >> > +  if (index < arg_npatterns)
> >> > + {
> >>
> >> I don't know whether it matters in practice, but I think the two conditions
> >> above are more natural as:
> >>
> >> if (maybe_lt (r1, arg_npatterns))
> >>   {
> >> unsigned HOST_WIDE_INT index;
> >> if (!r1.is_constant ())
> >>   return false;
> >>
> >> ...[code below]...
> >>   }
> >>
> >> > +   tree arg_elem0 = vector_cst_elt (arg, index);
> >> > +   tree arg_elem1 = vector_cst_elt (arg, index + arg_npatterns);
> >> > +   tree arg_elem2 = vector_cst_elt (arg, index + arg_npatterns * 2);
> >> > +
> >> > +   if (!operand_equal_p (const_binop (MINUS_EXPR, arg_elem2, 
> >> > arg_elem1),
> >> > + const_binop (MINUS_EXPR, arg_elem1, 
> >> > arg_elem0),
> >> > + 0))
> >>
> >> This needs to check whether const_binop returns null.  Maybe:
> >>
> >>tree step1, step2;
> >>if (!(step1 = const_binop (MINUS_EXPR, arg_elem1, arg_elem0))
> >>|| !(step2 = const_binop (MINUS_EXPR, arg_elem2, arg_elem1))
> >>|| !operand_equal_p (step1, step2, 0))
> >>
> >> OK with those changes, thanks.
> > Hi Richard,
> > Thanks for the suggestions, updated the attached patch accordingly.
> > Bootstrapped+tested with and without SVE on aarch64-linux-gnu and
> > x86_64-linux-gnu.
> > OK to commit ?
>
> Yes, thanks.
Thanks, committed to trunk in 3ec8ecb8e92faec889bc6f7aeac9ff59e82b4f7f.

Thanks,
Prathamesh
>
> Richard
>
> >
> > Thanks,
> > Prathamesh
> >>
> >> Richard
> >>
> >> > + {
> >> > +   if (reason)
> >> > + *reason = "not a natural stepped sequence";
> >> > +   return false;
> >> > + }
> >> > + }
> >> >  }
> >> >
> >> >return true;
> >> > @@ -17161,7 +17183,8 @@ namespace test_fold_vec_perm_cst {
> >> >  static tree
> >> >  build_vec_cst_rand (machine_mode vmode, unsigned npatterns,
> >> >   unsigned nelts_per_pattern,
> >> > - int step = 0, int threshold = 100)
> >> > + int step = 0, bool natural_stepped = false,
> >> > + int threshold = 100)
> >> >  {
> >> >tree inner_type = lang_hooks.types.type_for_mode (GET_MODE_INNER 
> >> > (vmode), 1);
> >> >tree vectype = build_vector_type_for_mode (inner_type, vmode);
> >> > @@ -17176,17 +17199,28 @@ build_vec_cst_rand (machine_mode vmode, 
> >&

Re: PR111648: Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding

2023-10-17 Thread Prathamesh Kulkarni
On Tue, 17 Oct 2023 at 02:40, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Wed, 11 Oct 2023 at 16:57, Prathamesh Kulkarni
> >  wrote:
> >>
> >> On Wed, 11 Oct 2023 at 16:42, Prathamesh Kulkarni
> >>  wrote:
> >> >
> >> > On Mon, 9 Oct 2023 at 17:05, Richard Sandiford
> >> >  wrote:
> >> > >
> >> > > Prathamesh Kulkarni  writes:
> >> > > > Hi,
> >> > > > The attached patch attempts to fix PR111648.
> >> > > > As mentioned in PR, the issue is when a1 is a multiple of vector
> >> > > > length, we end up creating following encoding in result: { base_elem,
> >> > > > arg[0], arg[1], ... } (assuming S = 1),
> >> > > > where arg is chosen input vector, which is incorrect, since the
> >> > > > encoding originally in arg would be: { arg[0], arg[1], arg[2], ... }
> >> > > >
> >> > > > For the test-case mentioned in PR, vectorizer pass creates
> >> > > > VEC_PERM_EXPR where:
> >> > > > arg0: { -16, -9, -10, -11 }
> >> > > > arg1: { -12, -5, -6, -7 }
> >> > > > sel = { 3, 4, 5, 6 }
> >> > > >
> >> > > > arg0, arg1 and sel are encoded with npatterns = 1 and 
> >> > > > nelts_per_pattern = 3.
> >> > > > Since a1 = 4 and arg_len = 4, it ended up creating the result with
> >> > > > following encoding:
> >> > > > res = { arg0[3], arg1[0], arg1[1] } // npatterns = 1, 
> >> > > > nelts_per_pattern = 3
> >> > > >   = { -11, -12, -5 }
> >> > > >
> >> > > > So for res[3], it used S = (-5) - (-12) = 7
> >> > > > And hence computed it as -5 + 7 = 2.
> >> > > > instead of selecting arg1[2], ie, -6.
> >> > > >
> >> > > > The patch tweaks valid_mask_for_fold_vec_perm_cst_p to punt if a1 is 
> >> > > > a multiple
> >> > > > of vector length, so a1 ... ae select elements only from stepped part
> >> > > > of the pattern
> >> > > > from input vector and return false for this case.
> >> > > >
> >> > > > Since the vectors are VLS, fold_vec_perm_cst then sets:
> >> > > > res_npatterns = res_nelts
> >> > > > res_nelts_per_pattern  = 1
> >> > > > which seems to fix the issue by encoding all the elements.
> >> > > >
> >> > > > The patch resulted in Case 4 and Case 5 failing from 
> >> > > > test_nunits_min_2 because
> >> > > > they used sel = { 0, 0, 1, ... } and {len, 0, 1, ... } respectively,
> >> > > > which used a1 = 0, and thus selected arg1[0].
> >> > > >
> >> > > > I removed Case 4 because it was already covered in test_nunits_min_4,
> >> > > > and moved Case 5 to test_nunits_min_4, with sel = { len, 1, 2, ... }
> >> > > > and added a new Case 9 to test for this issue.
> >> > > >
> >> > > > Passes bootstrap+test on aarch64-linux-gnu with and without SVE,
> >> > > > and on x86_64-linux-gnu.
> >> > > > Does the patch look OK ?
> >> > > >
> >> > > > Thanks,
> >> > > > Prathamesh
> >> > > >
> >> > > > [PR111648] Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding.
> >> > > >
> >> > > > gcc/ChangeLog:
> >> > > >   PR tree-optimization/111648
> >> > > >   * fold-const.cc (valid_mask_for_fold_vec_perm_cst_p): Punt if 
> >> > > > a1
> >> > > >   is a multiple of vector length.
> >> > > >   (test_nunits_min_2): Remove Case 4 and move Case 5 to ...
> >> > > >   (test_nunits_min_4): ... here and rename case numbers. Also add
> >> > > >   Case 9.
> >> > > >
> >> > > > gcc/testsuite/ChangeLog:
> >> > > >   PR tree-optimization/111648
> >> > > >   * gcc.dg/vect/pr111648.c: New test.
> >> > > >
> >> > > >
> >> > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> >> > > > index 4f8561509ff..c5f421d6b76 100644
> >> > > > --- a/gcc/fold-const.cc
> >> > > > +++ b/gcc/fold-const.cc
> &

Re: PR111648: Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding

2023-10-12 Thread Prathamesh Kulkarni
On Wed, 11 Oct 2023 at 16:57, Prathamesh Kulkarni
 wrote:
>
> On Wed, 11 Oct 2023 at 16:42, Prathamesh Kulkarni
>  wrote:
> >
> > On Mon, 9 Oct 2023 at 17:05, Richard Sandiford
> >  wrote:
> > >
> > > Prathamesh Kulkarni  writes:
> > > > Hi,
> > > > The attached patch attempts to fix PR111648.
> > > > As mentioned in PR, the issue is when a1 is a multiple of vector
> > > > length, we end up creating following encoding in result: { base_elem,
> > > > arg[0], arg[1], ... } (assuming S = 1),
> > > > where arg is chosen input vector, which is incorrect, since the
> > > > encoding originally in arg would be: { arg[0], arg[1], arg[2], ... }
> > > >
> > > > For the test-case mentioned in PR, vectorizer pass creates
> > > > VEC_PERM_EXPR where:
> > > > arg0: { -16, -9, -10, -11 }
> > > > arg1: { -12, -5, -6, -7 }
> > > > sel = { 3, 4, 5, 6 }
> > > >
> > > > arg0, arg1 and sel are encoded with npatterns = 1 and nelts_per_pattern 
> > > > = 3.
> > > > Since a1 = 4 and arg_len = 4, it ended up creating the result with
> > > > following encoding:
> > > > res = { arg0[3], arg1[0], arg1[1] } // npatterns = 1, nelts_per_pattern 
> > > > = 3
> > > >   = { -11, -12, -5 }
> > > >
> > > > So for res[3], it used S = (-5) - (-12) = 7
> > > > And hence computed it as -5 + 7 = 2.
> > > > instead of selecting arg1[2], ie, -6.
> > > >
> > > > The patch tweaks valid_mask_for_fold_vec_perm_cst_p to punt if a1 is a 
> > > > multiple
> > > > of vector length, so a1 ... ae select elements only from stepped part
> > > > of the pattern
> > > > from input vector and return false for this case.
> > > >
> > > > Since the vectors are VLS, fold_vec_perm_cst then sets:
> > > > res_npatterns = res_nelts
> > > > res_nelts_per_pattern  = 1
> > > > which seems to fix the issue by encoding all the elements.
> > > >
> > > > The patch resulted in Case 4 and Case 5 failing from test_nunits_min_2 
> > > > because
> > > > they used sel = { 0, 0, 1, ... } and {len, 0, 1, ... } respectively,
> > > > which used a1 = 0, and thus selected arg1[0].
> > > >
> > > > I removed Case 4 because it was already covered in test_nunits_min_4,
> > > > and moved Case 5 to test_nunits_min_4, with sel = { len, 1, 2, ... }
> > > > and added a new Case 9 to test for this issue.
> > > >
> > > > Passes bootstrap+test on aarch64-linux-gnu with and without SVE,
> > > > and on x86_64-linux-gnu.
> > > > Does the patch look OK ?
> > > >
> > > > Thanks,
> > > > Prathamesh
> > > >
> > > > [PR111648] Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding.
> > > >
> > > > gcc/ChangeLog:
> > > >   PR tree-optimization/111648
> > > >   * fold-const.cc (valid_mask_for_fold_vec_perm_cst_p): Punt if a1
> > > >   is a multiple of vector length.
> > > >   (test_nunits_min_2): Remove Case 4 and move Case 5 to ...
> > > >   (test_nunits_min_4): ... here and rename case numbers. Also add
> > > >   Case 9.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >   PR tree-optimization/111648
> > > >   * gcc.dg/vect/pr111648.c: New test.
> > > >
> > > >
> > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > > > index 4f8561509ff..c5f421d6b76 100644
> > > > --- a/gcc/fold-const.cc
> > > > +++ b/gcc/fold-const.cc
> > > > @@ -10682,8 +10682,8 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> > > > tree arg1,
> > > > return false;
> > > >   }
> > > >
> > > > -  /* Ensure that the stepped sequence always selects from the same
> > > > -  input pattern.  */
> > > > +  /* Ensure that the stepped sequence always selects from the 
> > > > stepped
> > > > +  part of same input pattern.  */
> > > >unsigned arg_npatterns
> > > >   = ((q1 & 1) == 0) ? VECTOR_CST_NPATTERNS (arg0)
> > > > : VECTOR_CST_NPATTERNS (arg1);
> > > > @@ -10694,6 +10694,20 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> > > > tree arg1,
> > > >

Re: PR111648: Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding

2023-10-11 Thread Prathamesh Kulkarni
On Wed, 11 Oct 2023 at 16:42, Prathamesh Kulkarni
 wrote:
>
> On Mon, 9 Oct 2023 at 17:05, Richard Sandiford
>  wrote:
> >
> > Prathamesh Kulkarni  writes:
> > > Hi,
> > > The attached patch attempts to fix PR111648.
> > > As mentioned in PR, the issue is when a1 is a multiple of vector
> > > length, we end up creating following encoding in result: { base_elem,
> > > arg[0], arg[1], ... } (assuming S = 1),
> > > where arg is chosen input vector, which is incorrect, since the
> > > encoding originally in arg would be: { arg[0], arg[1], arg[2], ... }
> > >
> > > For the test-case mentioned in PR, vectorizer pass creates
> > > VEC_PERM_EXPR where:
> > > arg0: { -16, -9, -10, -11 }
> > > arg1: { -12, -5, -6, -7 }
> > > sel = { 3, 4, 5, 6 }
> > >
> > > arg0, arg1 and sel are encoded with npatterns = 1 and nelts_per_pattern = 
> > > 3.
> > > Since a1 = 4 and arg_len = 4, it ended up creating the result with
> > > following encoding:
> > > res = { arg0[3], arg1[0], arg1[1] } // npatterns = 1, nelts_per_pattern = 
> > > 3
> > >   = { -11, -12, -5 }
> > >
> > > So for res[3], it used S = (-5) - (-12) = 7
> > > And hence computed it as -5 + 7 = 2.
> > > instead of selecting arg1[2], ie, -6.
> > >
> > > The patch tweaks valid_mask_for_fold_vec_perm_cst_p to punt if a1 is a 
> > > multiple
> > > of vector length, so a1 ... ae select elements only from stepped part
> > > of the pattern
> > > from input vector and return false for this case.
> > >
> > > Since the vectors are VLS, fold_vec_perm_cst then sets:
> > > res_npatterns = res_nelts
> > > res_nelts_per_pattern  = 1
> > > which seems to fix the issue by encoding all the elements.
> > >
> > > The patch resulted in Case 4 and Case 5 failing from test_nunits_min_2 
> > > because
> > > they used sel = { 0, 0, 1, ... } and {len, 0, 1, ... } respectively,
> > > which used a1 = 0, and thus selected arg1[0].
> > >
> > > I removed Case 4 because it was already covered in test_nunits_min_4,
> > > and moved Case 5 to test_nunits_min_4, with sel = { len, 1, 2, ... }
> > > and added a new Case 9 to test for this issue.
> > >
> > > Passes bootstrap+test on aarch64-linux-gnu with and without SVE,
> > > and on x86_64-linux-gnu.
> > > Does the patch look OK ?
> > >
> > > Thanks,
> > > Prathamesh
> > >
> > > [PR111648] Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding.
> > >
> > > gcc/ChangeLog:
> > >   PR tree-optimization/111648
> > >   * fold-const.cc (valid_mask_for_fold_vec_perm_cst_p): Punt if a1
> > >   is a multiple of vector length.
> > >   (test_nunits_min_2): Remove Case 4 and move Case 5 to ...
> > >   (test_nunits_min_4): ... here and rename case numbers. Also add
> > >   Case 9.
> > >
> > > gcc/testsuite/ChangeLog:
> > >   PR tree-optimization/111648
> > >   * gcc.dg/vect/pr111648.c: New test.
> > >
> > >
> > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > > index 4f8561509ff..c5f421d6b76 100644
> > > --- a/gcc/fold-const.cc
> > > +++ b/gcc/fold-const.cc
> > > @@ -10682,8 +10682,8 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> > > tree arg1,
> > > return false;
> > >   }
> > >
> > > -  /* Ensure that the stepped sequence always selects from the same
> > > -  input pattern.  */
> > > +  /* Ensure that the stepped sequence always selects from the stepped
> > > +  part of same input pattern.  */
> > >unsigned arg_npatterns
> > >   = ((q1 & 1) == 0) ? VECTOR_CST_NPATTERNS (arg0)
> > > : VECTOR_CST_NPATTERNS (arg1);
> > > @@ -10694,6 +10694,20 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> > > tree arg1,
> > >   *reason = "step is not multiple of npatterns";
> > > return false;
> > >   }
> > > +
> > > +  /* If a1 is a multiple of len, it will select base element of input
> > > +  vector resulting in following encoding:
> > > +  { base_elem, arg[0], arg[1], ... } where arg is the chosen input
> > > +  vector. This encoding is not originally present in arg, since it's
> > > +  defined as:
> > > +  { arg[0], arg[1], arg[2],

Re: PR111648: Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding

2023-10-11 Thread Prathamesh Kulkarni
On Mon, 9 Oct 2023 at 17:05, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi,
> > The attached patch attempts to fix PR111648.
> > As mentioned in PR, the issue is when a1 is a multiple of vector
> > length, we end up creating following encoding in result: { base_elem,
> > arg[0], arg[1], ... } (assuming S = 1),
> > where arg is chosen input vector, which is incorrect, since the
> > encoding originally in arg would be: { arg[0], arg[1], arg[2], ... }
> >
> > For the test-case mentioned in PR, vectorizer pass creates
> > VEC_PERM_EXPR where:
> > arg0: { -16, -9, -10, -11 }
> > arg1: { -12, -5, -6, -7 }
> > sel = { 3, 4, 5, 6 }
> >
> > arg0, arg1 and sel are encoded with npatterns = 1 and nelts_per_pattern = 3.
> > Since a1 = 4 and arg_len = 4, it ended up creating the result with
> > following encoding:
> > res = { arg0[3], arg1[0], arg1[1] } // npatterns = 1, nelts_per_pattern = 3
> >   = { -11, -12, -5 }
> >
> > So for res[3], it used S = (-5) - (-12) = 7
> > And hence computed it as -5 + 7 = 2.
> > instead of selecting arg1[2], ie, -6.
> >
> > The patch tweaks valid_mask_for_fold_vec_perm_cst_p to punt if a1 is a 
> > multiple
> > of vector length, so a1 ... ae select elements only from stepped part
> > of the pattern
> > from input vector and return false for this case.
> >
> > Since the vectors are VLS, fold_vec_perm_cst then sets:
> > res_npatterns = res_nelts
> > res_nelts_per_pattern  = 1
> > which seems to fix the issue by encoding all the elements.
> >
> > The patch resulted in Case 4 and Case 5 failing from test_nunits_min_2 
> > because
> > they used sel = { 0, 0, 1, ... } and {len, 0, 1, ... } respectively,
> > which used a1 = 0, and thus selected arg1[0].
> >
> > I removed Case 4 because it was already covered in test_nunits_min_4,
> > and moved Case 5 to test_nunits_min_4, with sel = { len, 1, 2, ... }
> > and added a new Case 9 to test for this issue.
> >
> > Passes bootstrap+test on aarch64-linux-gnu with and without SVE,
> > and on x86_64-linux-gnu.
> > Does the patch look OK ?
> >
> > Thanks,
> > Prathamesh
> >
> > [PR111648] Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding.
> >
> > gcc/ChangeLog:
> >   PR tree-optimization/111648
> >   * fold-const.cc (valid_mask_for_fold_vec_perm_cst_p): Punt if a1
> >   is a multiple of vector length.
> >   (test_nunits_min_2): Remove Case 4 and move Case 5 to ...
> >   (test_nunits_min_4): ... here and rename case numbers. Also add
> >   Case 9.
> >
> > gcc/testsuite/ChangeLog:
> >   PR tree-optimization/111648
> >   * gcc.dg/vect/pr111648.c: New test.
> >
> >
> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > index 4f8561509ff..c5f421d6b76 100644
> > --- a/gcc/fold-const.cc
> > +++ b/gcc/fold-const.cc
> > @@ -10682,8 +10682,8 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree 
> > arg1,
> > return false;
> >   }
> >
> > -  /* Ensure that the stepped sequence always selects from the same
> > -  input pattern.  */
> > +  /* Ensure that the stepped sequence always selects from the stepped
> > +  part of same input pattern.  */
> >unsigned arg_npatterns
> >   = ((q1 & 1) == 0) ? VECTOR_CST_NPATTERNS (arg0)
> > : VECTOR_CST_NPATTERNS (arg1);
> > @@ -10694,6 +10694,20 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, 
> > tree arg1,
> >   *reason = "step is not multiple of npatterns";
> > return false;
> >   }
> > +
> > +  /* If a1 is a multiple of len, it will select base element of input
> > +  vector resulting in following encoding:
> > +  { base_elem, arg[0], arg[1], ... } where arg is the chosen input
> > +  vector. This encoding is not originally present in arg, since it's
> > +  defined as:
> > +  { arg[0], arg[1], arg[2], ... }.  */
> > +
> > +  if (multiple_p (a1, arg_len))
> > + {
> > +   if (reason)
> > + *reason = "selecting base element of input vector";
> > +   return false;
> > + }
>
> That wouldn't catch (for example) cases where a1 == arg_len + 1 and the
> second argument has 2 stepped patterns.
Ah right, thanks for pointing out. In the attached patch I extended the check
so that r1 < arg_npatterns which should check if we are choosing base
elements from any of the patterns in arg (and not just

PR111648: Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding

2023-10-04 Thread Prathamesh Kulkarni
Hi,
The attached patch attempts to fix PR111648.
As mentioned in the PR, the issue is that when a1 is a multiple of the
vector length, we end up creating the following encoding in the result:
{ base_elem, arg[0], arg[1], ... } (assuming S = 1),
where arg is the chosen input vector. This is incorrect, since the
encoding originally in arg would be: { arg[0], arg[1], arg[2], ... }

For the test-case mentioned in PR, vectorizer pass creates
VEC_PERM_EXPR where:
arg0: { -16, -9, -10, -11 }
arg1: { -12, -5, -6, -7 }
sel = { 3, 4, 5, 6 }

arg0, arg1 and sel are encoded with npatterns = 1 and nelts_per_pattern = 3.
Since a1 = 4 and arg_len = 4, it ended up creating the result with
following encoding:
res = { arg0[3], arg1[0], arg1[1] } // npatterns = 1, nelts_per_pattern = 3
  = { -11, -12, -5 }

So for res[3], it used S = (-5) - (-12) = 7,
and hence computed it as -5 + 7 = 2,
instead of selecting arg1[2], i.e., -6.

The patch tweaks valid_mask_for_fold_vec_perm_cst_p to punt if a1 is a multiple
of the vector length, so that a1 ... ae select elements only from the stepped
part of the pattern in the input vector, and returns false for this case.

Since the vectors are VLS, fold_vec_perm_cst then sets:
res_npatterns = res_nelts
res_nelts_per_pattern  = 1
which seems to fix the issue by encoding all the elements.

The patch resulted in Case 4 and Case 5 failing from test_nunits_min_2 because
they used sel = { 0, 0, 1, ... } and {len, 0, 1, ... } respectively,
which used a1 = 0, and thus selected arg1[0].

I removed Case 4 because it was already covered in test_nunits_min_4,
and moved Case 5 to test_nunits_min_4, with sel = { len, 1, 2, ... }
and added a new Case 9 to test for this issue.

Passes bootstrap+test on aarch64-linux-gnu with and without SVE,
and on x86_64-linux-gnu.
Does the patch look OK?

Thanks,
Prathamesh
[PR111648] Fix wrong code-gen due to incorrect VEC_PERM_EXPR folding.

gcc/ChangeLog:
PR tree-optimization/111648
* fold-const.cc (valid_mask_for_fold_vec_perm_cst_p): Punt if a1
is a multiple of vector length.
(test_nunits_min_2): Remove Case 4 and move Case 5 to ...
(test_nunits_min_4): ... here and rename case numbers. Also add
Case 9.

gcc/testsuite/ChangeLog:
PR tree-optimization/111648
* gcc.dg/vect/pr111648.c: New test.


diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 4f8561509ff..c5f421d6b76 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -10682,8 +10682,8 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree 
arg1,
  return false;
}
 
-  /* Ensure that the stepped sequence always selects from the same
-input pattern.  */
+  /* Ensure that the stepped sequence always selects from the stepped
+part of same input pattern.  */
   unsigned arg_npatterns
= ((q1 & 1) == 0) ? VECTOR_CST_NPATTERNS (arg0)
  : VECTOR_CST_NPATTERNS (arg1);
@@ -10694,6 +10694,20 @@ valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree 
arg1,
*reason = "step is not multiple of npatterns";
  return false;
}
+
+  /* If a1 is a multiple of len, it will select base element of input
+vector resulting in following encoding:
+{ base_elem, arg[0], arg[1], ... } where arg is the chosen input
+vector. This encoding is not originally present in arg, since it's
+defined as:
+{ arg[0], arg[1], arg[2], ... }.  */
+
+  if (multiple_p (a1, arg_len))
+   {
+ if (reason)
+   *reason = "selecting base element of input vector";
+ return false;
+   }
 }
 
   return true;
@@ -17425,47 +17439,6 @@ test_nunits_min_2 (machine_mode vmode)
tree expected_res[] = { ARG0(0), ARG1(0), ARG0(1), ARG1(1) };
validate_res (2, 2, res, expected_res);
   }
-
-  /* Case 4: mask = {0, 0, 1, ...} // (1, 3)
-Test that the stepped sequence of the pattern selects from
-same input pattern. Since input vectors have npatterns = 2,
-and step (a2 - a1) = 1, step is not a multiple of npatterns
-in input vector. So return NULL_TREE.  */
-  {
-   tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
-   tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
-   poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
-
-   vec_perm_builder builder (len, 1, 3);
-   poly_uint64 mask_elems[] = { 0, 0, 1 };
-   builder_push_elems (builder, mask_elems);
-
-   vec_perm_indices sel (builder, 2, len);
-   const char *reason;
-   tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel,
- );
-   ASSERT_TRUE (res == NULL_TREE);
-   ASSERT_TRUE (!strcmp (reason, "step is not multiple of npatterns"));
-  }
-
-  /* Case 5: mask = {len, 0, 1, ...} // (1, 3)
-Test that stepped sequence of the pattern selects from arg0.
-res = { arg1[0], arg0[0], arg0[1], ... } // (1, 3)  */
-  {
-   tree arg0 = 

Re: [AArch64][testsuite] Adjust vect_copy_lane_1.c for new code-gen

2023-09-18 Thread Prathamesh Kulkarni via Gcc-patches
On Sun, 17 Sept 2023 at 20:11, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi,
> > After 27de9aa152141e7f3ee66372647d0f2cd94c4b90, there's a following 
> > regression:
> > FAIL: gcc.target/aarch64/vect_copy_lane_1.c scan-assembler-times
> > ins\\tv0.s\\[1\\], v1.s\\[0\\] 3
> >
> > This happens because for the following function from vect_copy_lane_1.c:
> > float32x2_t
> > __attribute__((noinline, noclone)) test_copy_lane_f32 (float32x2_t a,
> > float32x2_t b)
> > {
> >   return vcopy_lane_f32 (a, 1, b, 0);
> > }
> >
> > Before 27de9aa152141e7f3ee66372647d0f2cd94c4b90,
> > it got lowered to following sequence in .optimized dump:
> >[local count: 1073741824]:
> >   _4 = BIT_FIELD_REF ;
> >   __a_5 = BIT_INSERT_EXPR ;
> >   return __a_5;
> >
> > The above commit simplifies BIT_FIELD_REF + BIT_INSERT_EXPR
> > to vector permutation and now thus gets lowered to:
> >
> >[local count: 1073741824]:
> >   __a_4 = VEC_PERM_EXPR ;
> >   return __a_4;
> >
> > Since we give higher priority to aarch64_evpc_zip over aarch64_evpc_ins
> > in aarch64_expand_vec_perm_const_1, it now generates:
> >
> > test_copy_lane_f32:
> > zip1v0.2s, v0.2s, v1.2s
> > ret
> >
> > Similarly for test_copy_lane_[us]32.
>
> Yeah, I suppose this choice is at least as good as INS.  It has the advantage
> that the source and destination don't need to be tied.  For example:
>
> int32x2_t f(int32x2_t a, int32x2_t b, int32x2_t c) {
> return vcopy_lane_s32 (b, 1, c, 0);
> }
>
> used to be:
>
> f:
> mov v0.8b, v1.8b
> ins v0.s[1], v2.s[0]
> ret
>
> but is now:
>
> f:
> zip1v0.2s, v1.2s, v2.2s
> ret
>
> > The attached patch adjusts the tests to reflect the change in code-gen
> > and the tests pass.
> > OK to commit ?
> >
> > Thanks,
> > Prathamesh
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c 
> > b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
> > index 2848be564d5..811dc678b92 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
> > @@ -22,7 +22,7 @@ BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2)
> >  BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
> >  BUILD_TEST (int32x2_t,   int32x2_t,   , , s32, 1, 0)
> >  BUILD_TEST (uint32x2_t,  uint32x2_t,  , , u32, 1, 0)
> > -/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 
> > 3 } } */
> > +/* { dg-final { scan-assembler-times "zip1\\tv0.2s, v0.2s, v1.2s" 3 } } */
> >  BUILD_TEST (int64x1_t,   int64x1_t,   , , s64, 0, 0)
> >  BUILD_TEST (uint64x1_t,  uint64x1_t,  , , u64, 0, 0)
> >  BUILD_TEST (float64x1_t, float64x1_t, , , f64, 0, 0)
>
> OK, thanks.
Thanks, committed to trunk in 98c25cfc79a21886de7342fb563c4eb3c3d5f4e9.

Thanks,
Prathamesh
>
> Richard


[AArch64][testsuite] Adjust vect_copy_lane_1.c for new code-gen

2023-09-13 Thread Prathamesh Kulkarni via Gcc-patches
Hi,
After 27de9aa152141e7f3ee66372647d0f2cd94c4b90, there's a following regression:
FAIL: gcc.target/aarch64/vect_copy_lane_1.c scan-assembler-times
ins\\tv0.s\\[1\\], v1.s\\[0\\] 3

This happens because for the following function from vect_copy_lane_1.c:
float32x2_t
__attribute__((noinline, noclone)) test_copy_lane_f32 (float32x2_t a,
float32x2_t b)
{
  return vcopy_lane_f32 (a, 1, b, 0);
}

Before 27de9aa152141e7f3ee66372647d0f2cd94c4b90,
it got lowered to following sequence in .optimized dump:
   [local count: 1073741824]:
  _4 = BIT_FIELD_REF ;
  __a_5 = BIT_INSERT_EXPR ;
  return __a_5;

The above commit simplifies BIT_FIELD_REF + BIT_INSERT_EXPR
to vector permutation and now thus gets lowered to:

   [local count: 1073741824]:
  __a_4 = VEC_PERM_EXPR ;
  return __a_4;

Since we give higher priority to aarch64_evpc_zip over aarch64_evpc_ins
in aarch64_expand_vec_perm_const_1, it now generates:

test_copy_lane_f32:
zip1v0.2s, v0.2s, v1.2s
ret

Similarly for test_copy_lane_[us]32.
The attached patch adjusts the tests to reflect the change in code-gen,
and the tests pass.
OK to commit?

Thanks,
Prathamesh
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c 
b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
index 2848be564d5..811dc678b92 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect_copy_lane_1.c
@@ -22,7 +22,7 @@ BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2)
 BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
 BUILD_TEST (int32x2_t,   int32x2_t,   , , s32, 1, 0)
 BUILD_TEST (uint32x2_t,  uint32x2_t,  , , u32, 1, 0)
-/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } 
} */
+/* { dg-final { scan-assembler-times "zip1\\tv0.2s, v0.2s, v1.2s" 3 } } */
 BUILD_TEST (int64x1_t,   int64x1_t,   , , s64, 0, 0)
 BUILD_TEST (uint64x1_t,  uint64x1_t,  , , u64, 0, 0)
 BUILD_TEST (float64x1_t, float64x1_t, , , f64, 0, 0)


Re: [pushed] analyzer: fix ICE in text art strings support

2023-08-31 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 30 Aug 2023 at 19:20, David Malcolm  wrote:
>
> On Wed, 2023-08-30 at 11:52 +0530, Prathamesh Kulkarni wrote:
> > On Wed, 30 Aug 2023 at 04:21, David Malcolm 
> > wrote:
> > >
> > > On Tue, 2023-08-29 at 11:01 +0530, Prathamesh Kulkarni wrote:
> > > > On Fri, 25 Aug 2023 at 18:15, David Malcolm via Gcc-patches
> > > >  wrote:
> > > > >
> > > > > Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
> > > > > Pushed to trunk as r14-3481-g99a3fcb8ff0bf2.
> > > > Hi David,
> > > > It seems the new tests FAIL on arm for LTO bootstrap config:
> > > > https://ci.linaro.org/job/tcwg_bootstrap_check--master-arm-check_bootstrap_lto-build/263/artifact/artifacts/06-check_regression/fails.sum/*view*/
> > >
> > > Sorry about this.
> > >
> > > Looking at e.g. the console.log.xz, I just see the status of the
> > > failing tests.
> > >
> > > Is there an easy way to get at the stderr from the tests without
> > > rerunning this?
> > >
> > > Otherwise, I'd appreciate help with reproducing this.
> > Hi David,
> > I have attached make check log for the failing tests.
> > To reproduce, I configured and built gcc with following options on
> > armv8 machine:
> > ../gcc/configure --enable-languages=c,c++,fortran --with-float=hard
> > --with-fpu=neon-fp-armv8 --with-mode=thumb --with-arch=armv8-a
> > --disable-werror --with-build-config=bootstrap-lto
> > make -j$(nproc)
>
> Thanks.
>
> Looks a lot like PR analyzer/110483, which I'm working on now (sorry!)
>
> What's the endianness of the host?
Little endian. It was built natively (host == target) on
armv8l-unknown-linux-gnueabihf.
>
>
> Specifically, the pertinent part of the log is:
>
> FAIL: gcc.dg/analyzer/out-of-bounds-diagram-17.c (test for excess errors)
> Excess errors:
>┌─┬─┬┬┬┐┌─┬─┬─┐
>│ [1] │ [1] │[1] │[1] │[1] ││ [1] │ [1] │ [1] │
>├─┼─┼┼┼┤├─┼─┼─┤
>│ ' ' │ 'w' │'o' │'r' │'l' ││ 'd' │ '!' │ NUL │
>├─┴─┴┴┴┴┴─┴─┴─┤
>│  string literal (type: 'char[8]')   │
>└─┘
>   │ ││││  │ │ │
>   │ ││││  │ │ │
>   v vvvv  v v v
>   ┌─┬┬┐┌─┐
>   │ [0] │  ...   │[9] ││ │
>   ├─┴┴┤│after valid range│
>   │ 'buf' (type: 'char[10]')  ││ │
>   └───┘└─┘
>   ├─┬─┤├┬┤
> │   │
>   ╭─┴╮╭─┴─╮
>   │capacity: 10 bytes││overflow of 3 bytes│
>   ╰──╯╰───╯
>
> where the issue seems to be all those [1], which are meant to be index
> [0], [1], [2], etc.
Oh OK, thanks for the clarification!

Thanks,
Prathamesh
>
>
> Dave


Re: [pushed] analyzer: fix ICE in text art strings support

2023-08-30 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 30 Aug 2023 at 04:21, David Malcolm  wrote:
>
> On Tue, 2023-08-29 at 11:01 +0530, Prathamesh Kulkarni wrote:
> > On Fri, 25 Aug 2023 at 18:15, David Malcolm via Gcc-patches
> >  wrote:
> > >
> > > Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
> > > Pushed to trunk as r14-3481-g99a3fcb8ff0bf2.
> > Hi David,
> > It seems the new tests FAIL on arm for LTO bootstrap config:
> > https://ci.linaro.org/job/tcwg_bootstrap_check--master-arm-check_bootstrap_lto-build/263/artifact/artifacts/06-check_regression/fails.sum/*view*/
>
> Sorry about this.
>
> Looking at e.g. the console.log.xz, I just see the status of the
> failing tests.
>
> Is there an easy way to get at the stderr from the tests without
> rerunning this?
>
> Otherwise, I'd appreciate help with reproducing this.
Hi David,
I have attached make check log for the failing tests.
To reproduce, I configured and built gcc with following options on
armv8 machine:
../gcc/configure --enable-languages=c,c++,fortran --with-float=hard
--with-fpu=neon-fp-armv8 --with-mode=thumb --with-arch=armv8-a
--disable-werror --with-build-config=bootstrap-lto
make -j$(nproc)

Thanks,
Prathamesh
>
> Thanks
> Dave
>
> > Please let me know if you need any help in reproducing these
> > failures.
> >
> > Thanks,
> > Prathamesh
> > >
> > > gcc/analyzer/ChangeLog:
> > > * access-diagram.cc (class string_region_spatial_item):
> > > Remove
> > > assumption that the string is written to the start of the
> > > cluster.
> > >
> > > gcc/testsuite/ChangeLog:
> > > * gcc.dg/analyzer/out-of-bounds-diagram-17.c: New test.
> > > * gcc.dg/analyzer/out-of-bounds-diagram-18.c: New test.
> > > * gcc.dg/analyzer/out-of-bounds-diagram-19.c: New test.
> > > ---
> > >  gcc/analyzer/access-diagram.cc| 57 ---
> > > 
> > >  .../analyzer/out-of-bounds-diagram-17.c   | 34 +++
> > >  .../analyzer/out-of-bounds-diagram-18.c   | 38 +
> > >  .../analyzer/out-of-bounds-diagram-19.c   | 45 +++
> > >  4 files changed, 155 insertions(+), 19 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.dg/analyzer/out-of-bounds-
> > > diagram-17.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/analyzer/out-of-bounds-
> > > diagram-18.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/analyzer/out-of-bounds-
> > > diagram-19.c
> > >
> > > diff --git a/gcc/analyzer/access-diagram.cc b/gcc/analyzer/access-
> > > diagram.cc
> > > index d7b669a4e38e..a51d594b5b2c 100644
> > > --- a/gcc/analyzer/access-diagram.cc
> > > +++ b/gcc/analyzer/access-diagram.cc
> > > @@ -1509,10 +1509,16 @@ public:
> > >out.add_all_bytes_in_range (m_actual_bits);
> > >  else
> > >{
> > > -   byte_range head_of_string (0, m_ellipsis_head_len);
> > > +   byte_range bytes (0, 0);
> > > +   bool valid = m_actual_bits.as_concrete_byte_range ();
> > > +   gcc_assert (valid);
> > > +   byte_range head_of_string (bytes.get_start_byte_offset (),
> > > +  m_ellipsis_head_len);
> > > out.add_all_bytes_in_range (head_of_string);
> > > byte_range tail_of_string
> > > - (TREE_STRING_LENGTH (string_cst) - m_ellipsis_tail_len,
> > > + ((bytes.get_start_byte_offset ()
> > > +   + TREE_STRING_LENGTH (string_cst)
> > > +   - m_ellipsis_tail_len),
> > >m_ellipsis_tail_len);
> > > out.add_all_bytes_in_range (tail_of_string);
> > > /* Adding the above pair of ranges will also effectively
> > > add
> > > @@ -1535,11 +1541,14 @@ public:
> > >  tree string_cst = get_string_cst ();
> > >  if (m_show_full_string)
> > >{
> > > -   for (byte_offset_t byte_idx = bytes.get_start_byte_offset
> > > ();
> > > -   byte_idx < bytes.get_next_byte_offset ();
> > > -   byte_idx = byte_idx + 1)
> > > -add_column_for_byte (t, btm, sm, byte_idx,
> > > - byte_idx_table_y, byte_val_table_y);
> > > +   for (byte_offset_t byte_idx_within_cluster
> > > + = bytes.get_start_byte_offset ();
> > > +   byte_idx_within_cluster < bytes.get_next_byte_offset
> > > ();
> > > +  

Re: [PATCH 5/9] arm: [MVE intrinsics] add support for p8 and p16 polynomial types

2023-08-29 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 15 Aug 2023 at 00:05, Christophe Lyon via Gcc-patches
 wrote:
>
> Although they look like aliases for u8 and u16, we need to define them
> so that we can handle p8 and p16 suffixes with the general framework.
>
> They will be used by vmull[bt]q_poly intrinsics.
Hi Christophe,
It seems your patch committed in 9bae37ec8dc32027dedf9a32bf15754ebad6da38
broke arm bootstrap build due to Werror=missing-field-initializers:
https://ci.linaro.org/job/tcwg_bootstrap_build--master-arm-bootstrap-build/199/artifact/artifacts/notify/mail-body.txt/*view*/

I think this happens because the commit adds a new member to type_suffix_info:
-  unsigned int spare : 13;
+  /* True if the suffix is for a polynomial type.  */
+  unsigned int poly_p : 1;
+  unsigned int spare : 12;

but probably misses an initializer in arm-mve-builtins.cc:type_suffixes:
  { "", NUM_VECTOR_TYPES, TYPE_bool, 0, 0, false, false, false,
0, VOIDmode }

Thanks,
Prathamesh
>
> 2023-08-14  Christophe Lyon  
>
> gcc/
> * config/arm/arm-mve-builtins.cc (type_suffixes): Handle poly_p
> field..
> (TYPES_poly_8_16): New.
> (poly_8_16): New.
> * config/arm/arm-mve-builtins.def (p8): New type suffix.
> (p16): Likewise.
> * config/arm/arm-mve-builtins.h (enum type_class_index): Add
> TYPE_poly.
> (struct type_suffix_info): Add poly_p field.
> ---
>  gcc/config/arm/arm-mve-builtins.cc  | 6 ++
>  gcc/config/arm/arm-mve-builtins.def | 2 ++
>  gcc/config/arm/arm-mve-builtins.h   | 5 -
>  3 files changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/arm/arm-mve-builtins.cc 
> b/gcc/config/arm/arm-mve-builtins.cc
> index 7eec9d2861c..fa8b0ad36b3 100644
> --- a/gcc/config/arm/arm-mve-builtins.cc
> +++ b/gcc/config/arm/arm-mve-builtins.cc
> @@ -128,6 +128,7 @@ CONSTEXPR const type_suffix_info 
> type_suffixes[NUM_TYPE_SUFFIXES + 1] = {
>  TYPE_##CLASS == TYPE_signed || TYPE_##CLASS == TYPE_unsigned, \
>  TYPE_##CLASS == TYPE_unsigned, \
>  TYPE_##CLASS == TYPE_float, \
> +TYPE_##CLASS == TYPE_poly, \
>  0, \
>  MODE },
>  #include "arm-mve-builtins.def"
> @@ -177,6 +178,10 @@ CONSTEXPR const type_suffix_info 
> type_suffixes[NUM_TYPE_SUFFIXES + 1] = {
>  #define TYPES_all_signed(S, D) \
>S (s8), S (s16), S (s32)
>
> +/* _p8 _p16.  */
> +#define TYPES_poly_8_16(S, D) \
> +  S (p8), S (p16)
> +
>  /* _u8 _u16 _u32.  */
>  #define TYPES_all_unsigned(S, D) \
>S (u8), S (u16), S (u32)
> @@ -275,6 +280,7 @@ DEF_MVE_TYPES_ARRAY (integer_8);
>  DEF_MVE_TYPES_ARRAY (integer_8_16);
>  DEF_MVE_TYPES_ARRAY (integer_16_32);
>  DEF_MVE_TYPES_ARRAY (integer_32);
> +DEF_MVE_TYPES_ARRAY (poly_8_16);
>  DEF_MVE_TYPES_ARRAY (signed_16_32);
>  DEF_MVE_TYPES_ARRAY (signed_32);
>  DEF_MVE_TYPES_ARRAY (reinterpret_integer);
> diff --git a/gcc/config/arm/arm-mve-builtins.def 
> b/gcc/config/arm/arm-mve-builtins.def
> index e3f37876210..e2cf1baf370 100644
> --- a/gcc/config/arm/arm-mve-builtins.def
> +++ b/gcc/config/arm/arm-mve-builtins.def
> @@ -63,6 +63,8 @@ DEF_MVE_TYPE_SUFFIX (u8, uint8x16_t, unsigned, 8, V16QImode)
>  DEF_MVE_TYPE_SUFFIX (u16, uint16x8_t, unsigned, 16, V8HImode)
>  DEF_MVE_TYPE_SUFFIX (u32, uint32x4_t, unsigned, 32, V4SImode)
>  DEF_MVE_TYPE_SUFFIX (u64, uint64x2_t, unsigned, 64, V2DImode)
> +DEF_MVE_TYPE_SUFFIX (p8, uint8x16_t, poly, 8, V16QImode)
> +DEF_MVE_TYPE_SUFFIX (p16, uint16x8_t, poly, 16, V8HImode)
>  #undef REQUIRES_FLOAT
>
>  #define REQUIRES_FLOAT true
> diff --git a/gcc/config/arm/arm-mve-builtins.h 
> b/gcc/config/arm/arm-mve-builtins.h
> index c9b51a0c77b..37b8223dfb2 100644
> --- a/gcc/config/arm/arm-mve-builtins.h
> +++ b/gcc/config/arm/arm-mve-builtins.h
> @@ -146,6 +146,7 @@ enum type_class_index
>TYPE_float,
>TYPE_signed,
>TYPE_unsigned,
> +  TYPE_poly,
>NUM_TYPE_CLASSES
>  };
>
> @@ -221,7 +222,9 @@ struct type_suffix_info
>unsigned int unsigned_p : 1;
>/* True if the suffix is for a floating-point type.  */
>unsigned int float_p : 1;
> -  unsigned int spare : 13;
> +  /* True if the suffix is for a polynomial type.  */
> +  unsigned int poly_p : 1;
> +  unsigned int spare : 12;
>
>/* The associated vector or predicate mode.  */
>machine_mode vector_mode : 16;
> --
> 2.34.1
>


Re: [pushed] analyzer: fix ICE in text art strings support

2023-08-28 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 25 Aug 2023 at 18:15, David Malcolm via Gcc-patches
 wrote:
>
> Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
> Pushed to trunk as r14-3481-g99a3fcb8ff0bf2.
Hi David,
It seems the new tests FAIL on arm for LTO bootstrap config:
https://ci.linaro.org/job/tcwg_bootstrap_check--master-arm-check_bootstrap_lto-build/263/artifact/artifacts/06-check_regression/fails.sum/*view*/
Please let me know if you need any help in reproducing these failures.

Thanks,
Prathamesh
>
> gcc/analyzer/ChangeLog:
> * access-diagram.cc (class string_region_spatial_item): Remove
> assumption that the string is written to the start of the cluster.
>
> gcc/testsuite/ChangeLog:
> * gcc.dg/analyzer/out-of-bounds-diagram-17.c: New test.
> * gcc.dg/analyzer/out-of-bounds-diagram-18.c: New test.
> * gcc.dg/analyzer/out-of-bounds-diagram-19.c: New test.
> ---
>  gcc/analyzer/access-diagram.cc| 57 ---
>  .../analyzer/out-of-bounds-diagram-17.c   | 34 +++
>  .../analyzer/out-of-bounds-diagram-18.c   | 38 +
>  .../analyzer/out-of-bounds-diagram-19.c   | 45 +++
>  4 files changed, 155 insertions(+), 19 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/analyzer/out-of-bounds-diagram-17.c
>  create mode 100644 gcc/testsuite/gcc.dg/analyzer/out-of-bounds-diagram-18.c
>  create mode 100644 gcc/testsuite/gcc.dg/analyzer/out-of-bounds-diagram-19.c
>
> diff --git a/gcc/analyzer/access-diagram.cc b/gcc/analyzer/access-diagram.cc
> index d7b669a4e38e..a51d594b5b2c 100644
> --- a/gcc/analyzer/access-diagram.cc
> +++ b/gcc/analyzer/access-diagram.cc
> @@ -1509,10 +1509,16 @@ public:
>out.add_all_bytes_in_range (m_actual_bits);
>  else
>{
> -   byte_range head_of_string (0, m_ellipsis_head_len);
> +   byte_range bytes (0, 0);
> +   bool valid = m_actual_bits.as_concrete_byte_range (&bytes);
> +   gcc_assert (valid);
> +   byte_range head_of_string (bytes.get_start_byte_offset (),
> +  m_ellipsis_head_len);
> out.add_all_bytes_in_range (head_of_string);
> byte_range tail_of_string
> - (TREE_STRING_LENGTH (string_cst) - m_ellipsis_tail_len,
> + ((bytes.get_start_byte_offset ()
> +   + TREE_STRING_LENGTH (string_cst)
> +   - m_ellipsis_tail_len),
>m_ellipsis_tail_len);
> out.add_all_bytes_in_range (tail_of_string);
> /* Adding the above pair of ranges will also effectively add
> @@ -1535,11 +1541,14 @@ public:
>  tree string_cst = get_string_cst ();
>  if (m_show_full_string)
>{
> -   for (byte_offset_t byte_idx = bytes.get_start_byte_offset ();
> -   byte_idx < bytes.get_next_byte_offset ();
> -   byte_idx = byte_idx + 1)
> -add_column_for_byte (t, btm, sm, byte_idx,
> - byte_idx_table_y, byte_val_table_y);
> +   for (byte_offset_t byte_idx_within_cluster
> + = bytes.get_start_byte_offset ();
> +   byte_idx_within_cluster < bytes.get_next_byte_offset ();
> +   byte_idx_within_cluster = byte_idx_within_cluster + 1)
> +add_column_for_byte
> +  (t, btm, sm, byte_idx_within_cluster,
> +   byte_idx_within_cluster - bytes.get_start_byte_offset (),
> +   byte_idx_table_y, byte_val_table_y);
>
> if (m_show_utf8)
>  {
> @@ -1566,10 +1575,13 @@ public:
>  = decoded_char.m_start_byte - TREE_STRING_POINTER 
> (string_cst);
>byte_size_t size_in_bytes
>  = decoded_char.m_next_byte - decoded_char.m_start_byte;
> -  byte_range bytes (start_byte_idx, size_in_bytes);
> +  byte_range cluster_bytes_for_codepoint
> +(start_byte_idx + bytes.get_start_byte_offset (),
> + size_in_bytes);
>
>const table::rect_t code_point_table_rect
> -= btm.get_table_rect (&m_string_reg, bytes,
> += btm.get_table_rect (&m_string_reg,
> +  cluster_bytes_for_codepoint,
>utf8_code_point_table_y, 1);
>char buf[100];
>sprintf (buf, "U+%04x", decoded_char.m_ch);
> @@ -1579,7 +1591,8 @@ public:
>if (show_unichars)
>  {
>const table::rect_t character_table_rect
> -= btm.get_table_rect (&m_string_reg, bytes,
> += btm.get_table_rect (&m_string_reg,
> +  cluster_bytes_for_codepoint,
>utf8_character_table_y, 1);
>if (cpp_is_printable_char (decoded_char.m_ch))
>  t.set_cell_span (character_table_rect,
> @@ -1598,12 +1611,14 @@ public:
>{
> /* Head of string.  */
> for (int byte_idx = 0; byte_idx 

Re: [PATCH] testsuite: aarch64: Adjust SVE ACLE tests to new generated code

2023-08-24 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 24 Aug 2023 at 08:27, Thiago Jung Bauermann
 wrote:
>
> Since commit e7a36e4715c7 "[PATCH] RISC-V: Support simplify (-1-x) for
> vector." these tests fail on aarch64-linux:
>
> === g++ tests ===
>
> Running g++:g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp ...
> FAIL: gcc.target/aarch64/sve/acle/asm/subr_s8.c -std=gnu++98 -O2 
> -fno-schedule-insns -DCHECK_ASM --save-temps -DTEST_FULL  
> check-function-bodies subr_m1_s8_m
> FAIL: gcc.target/aarch64/sve/acle/asm/subr_s8.c -std=gnu++98 -O2 
> -fno-schedule-insns -DCHECK_ASM --save-temps -DTEST_OVERLOADS  
> check-function-bodies subr_m1_s8_m
> FAIL: gcc.target/aarch64/sve/acle/asm/subr_u8.c -std=gnu++98 -O2 
> -fno-schedule-insns -DCHECK_ASM --save-temps -DTEST_FULL  
> check-function-bodies subr_m1_u8_m
> FAIL: gcc.target/aarch64/sve/acle/asm/subr_u8.c -std=gnu++98 -O2 
> -fno-schedule-insns -DCHECK_ASM --save-temps -DTEST_OVERLOADS  
> check-function-bodies subr_m1_u8_m
>
> === gcc tests ===
>
> Running gcc:gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp ...
> FAIL: gcc.target/aarch64/sve/acle/asm/subr_s8.c -std=gnu90 -O2 
> -fno-schedule-insns -DCHECK_ASM --save-temps -DTEST_FULL  
> check-function-bodies subr_m1_s8_m
> FAIL: gcc.target/aarch64/sve/acle/asm/subr_s8.c -std=gnu90 -O2 
> -fno-schedule-insns -DCHECK_ASM --save-temps -DTEST_OVERLOADS  
> check-function-bodies subr_m1_s8_m
> FAIL: gcc.target/aarch64/sve/acle/asm/subr_u8.c -std=gnu90 -O2 
> -fno-schedule-insns -DCHECK_ASM --save-temps -DTEST_FULL  
> check-function-bodies subr_m1_u8_m
> FAIL: gcc.target/aarch64/sve/acle/asm/subr_u8.c -std=gnu90 -O2 
> -fno-schedule-insns -DCHECK_ASM --save-temps -DTEST_OVERLOADS  
> check-function-bodies subr_m1_u8_m
>
> Andrew Pinski's analysis in PR testsuite/111071 is that the new code is
> better and the testcase should be updated. I also asked Prathamesh Kulkarni
> in private and he agreed.
>
> Here is the update. With this change, all tests in
> gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp pass on aarch64-linux.
>
> gcc/testsuite/
> PR testsuite/111071
> * gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c: Adjust to 
> new code.
> * gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c: Likewise.
>
> Suggested-by: Andrew Pinski 
> ---
>  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c | 3 +--
>  gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c | 3 +--
>  2 files changed, 2 insertions(+), 4 deletions(-)
Hi Thiago,
The patch looks OK to me, but can't approve.

Thanks,
Prathamesh
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c
> index b9615de6655f..3e521bc9ae32 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c
> @@ -76,8 +76,7 @@ TEST_UNIFORM_Z (subr_1_s8_m_untied, svint8_t,
>
>  /*
>  ** subr_m1_s8_m:
> -** mov (z[0-9]+\.b), #-1
> -** subr z0\.b, p0/m, z0\.b, \1
> +** not z0\.b, p0/m, z0\.b
>  ** ret
>  */
>  TEST_UNIFORM_Z (subr_m1_s8_m, svint8_t,
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c
> index 65606b6dda03..4922bdbacc47 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c
> @@ -76,8 +76,7 @@ TEST_UNIFORM_Z (subr_1_u8_m_untied, svuint8_t,
>
>  /*
>  ** subr_m1_u8_m:
> -** mov (z[0-9]+\.b), #-1
> -** subr z0\.b, p0/m, z0\.b, \1
> +** not z0\.b, p0/m, z0\.b
>  ** ret
>  */
>  TEST_UNIFORM_Z (subr_m1_u8_m, svuint8_t,


Re: [PATCH] RISC-V: Support simplify (-1-x) for vector.

2023-08-21 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 16 Aug 2023 at 14:12, yanzhang.wang--- via Gcc-patches
 wrote:
>
> From: Yanzhang Wang 
>
> The pattern is enabled for scalar but not for vector. The patch try to
> make it consistent and will convert below code,
(CCing Richard S.)
Hi,
Sorry if this comment is not relevant to the patch but I was wondering if it
should also fold -1 - x --> ~x for the following test or is the test
written incorrectly ?

svint32_t f(svint32_t x)
{
  return svsub_s32_x (svptrue_b8 (), svdup_s32 (-1), x);
}

expand dump shows:
(insn 2 4 3 2 (set (reg/v:VNx4SI 93 [ x ])
(reg:VNx4SI 32 v0 [ x ])) "foo.c":9:1 -1
 (nil))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 7 2 (set (reg:VNx4SI 94)
(const_vector:VNx4SI repeat [
(const_int -1 [0xffffffffffffffff])
])) "foo.c":10:10 -1
 (nil))
(insn 7 6 11 2 (set (reg:VNx4SI 92 [ <retval> ])
(minus:VNx4SI (reg:VNx4SI 94)
(reg/v:VNx4SI 93 [ x ]))) "foo.c":10:10 -1
 (nil))
(insn 11 7 12 2 (set (reg/i:VNx4SI 32 v0)
(reg:VNx4SI 92 [ <retval> ])) "foo.c":11:1 -1
 (nil))
(insn 12 11 0 2 (use (reg/i:VNx4SI 32 v0)) "foo.c":11:1 -1
 (nil))

and results in following code-gen:
f:
mov z31.b, #-1
sub z0.s, z31.s, z0.s
ret

Altho I suppose at TREE level the above call to svsub_s32_x could be folded by
implementing the same transform (-1 - x -> ~x) in svsub_impl::fold ?

Thanks,
Prathamesh




>
> shortcut_for_riscv_vrsub_case_1_32:
> vl1re32.v   v1,0(a1)
> vsetvli zero,a2,e32,m1,ta,ma
> vrsub.vi v1,v1,-1
> vs1r.v  v1,0(a0)
> ret
>
> to,
>
> shortcut_for_riscv_vrsub_case_1_32:
> vl1re32.v   v1,0(a1)
> vsetvli zero,a2,e32,m1,ta,ma
> vnot.v  v1,v1
> vs1r.v  v1,0(a0)
> ret
>
> gcc/ChangeLog:
>
> * simplify-rtx.cc (simplify_context::simplify_binary_operation_1):
> Get -1 with mode.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/simplify-vrsub.c: New test.
>
> Signed-off-by: Yanzhang Wang 
> ---
>  gcc/simplify-rtx.cc|  2 +-
>  .../gcc.target/riscv/rvv/base/simplify-vrsub.c | 18 ++
>  2 files changed, 19 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/simplify-vrsub.c
>
> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
> index d7315d82aa3..eb1ac120832 100644
> --- a/gcc/simplify-rtx.cc
> +++ b/gcc/simplify-rtx.cc
> @@ -3071,7 +3071,7 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
> code,
>/* (-1 - a) is ~a, unless the expression contains symbolic
>  constants, in which case not retaining additions and
>  subtractions could cause invalid assembly to be produced.  */
> -  if (trueop0 == constm1_rtx
> +  if (trueop0 == CONSTM1_RTX (mode)
>   && !contains_symbolic_reference_p (op1))
> return simplify_gen_unary (NOT, mode, op1, mode);
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/simplify-vrsub.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/simplify-vrsub.c
> new file mode 100644
> index 000..df87ed94ea4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/simplify-vrsub.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
> +
> +#include "riscv_vector.h"
> +
> +#define VRSUB_WITH_LMUL(LMUL, DTYPE)\
> +  vint##DTYPE##m##LMUL##_t  \
> +  shortcut_for_riscv_vrsub_case_##LMUL##_##DTYPE\
> +  (vint##DTYPE##m##LMUL##_t v1, \
> +   size_t vl)   \
> +  { \
> +return __riscv_vrsub_vx_i##DTYPE##m##LMUL (v1, -1, vl); \
> +  }
> +
> +VRSUB_WITH_LMUL (1, 16)
> +VRSUB_WITH_LMUL (1, 32)
> +
> +/* { dg-final { scan-assembler-times {vnot\.v} 2 } } */
> --
> 2.41.0
>


Re: [PATCH][RFC] tree-optimization/92335 - Improve sinking heuristics for vectorization

2023-08-21 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 21 Aug 2023 at 12:27, Richard Biener  wrote:
>
> On Sat, 19 Aug 2023, Prathamesh Kulkarni wrote:
>
> > On Fri, 18 Aug 2023 at 17:11, Richard Biener  wrote:
> > >
> > > On Fri, 18 Aug 2023, Richard Biener wrote:
> > >
> > > > On Thu, 17 Aug 2023, Prathamesh Kulkarni wrote:
> > > >
> > > > > On Tue, 15 Aug 2023 at 14:28, Richard Sandiford
> > > > >  wrote:
> > > > > >
> > > > > > Richard Biener  writes:
> > > > > > > On Mon, 14 Aug 2023, Prathamesh Kulkarni wrote:
> > > > > > >> On Mon, 7 Aug 2023 at 13:19, Richard Biener 
> > > > > > >>  wrote:
> > > > > > >> > It doesn't seem to make a difference for x86.  That said, the 
> > > > > > >> > "fix" is
> > > > > > >> > probably sticking the correct target on the dump-check, it 
> > > > > > >> > seems
> > > > > > >> > that vect_fold_extract_last is no longer correct here.
> > > > > > >> Um sorry, I did go thru various checks in target-supports.exp, 
> > > > > > >> but not
> > > > > > >> sure which one will be appropriate for this case,
> > > > > > >> and am stuck here :/ Could you please suggest how to proceed ?
> > > > > > >
> > > > > > > Maybe Richard S. knows the magic thing to test, he originally
> > > > > > > implemented the direct conversion support.  I suggest to implement
> > > > > > > such dg-checks if they are not present (I can't find them),
> > > > > > > possibly quite specific to the modes involved (like we have
> > > > > > > other checks with _qi_to_hi suffixes, for float modes maybe
> > > > > > > just _float).
> > > > > >
> > > > > > Yeah, can't remember specific selectors for that feature.  TBH I 
> > > > > > think
> > > > > > most (all?) of the tests were AArch64-specific.
> > > > > Hi,
> > > > > As Richi mentioned above, the test now vectorizes on AArch64 because
> > > > > it has support for direct conversion
> > > > > between vectors while x86 doesn't. IIUC this is because
> > > > > supportable_convert_operation returns true
> > > > > for V4HI -> V4SI on Aarch64 since it can use extend_v4hiv4si2 for
> > > > > doing the conversion ?
> > > > >
> > > > > In the attached patch, I added a new target check vect_extend which
> > > > > (currently) returns 1 only for aarch64*-*-*,
> > > > > which makes the test PASS on both the targets, altho I am not sure if
> > > > > this is entirely correct.
> > > > > Does the patch look OK ?
> > > >
> > > > Can you make vect_extend more specific, say vect_extend_hi_si or
> > > > what is specifically needed here?  Note I'll have to investigate
> > > > why x86 cannot vectorize here since in fact it does have
> > > > the extend operation ... it might be also worth splitting the
> > > > sign/zero extend case, so - vect_sign_extend_hi_si or
> > > > vect_extend_short_int?
> > >
> > > And now having analyzed _why_ x86 doesn't vectorize it's rather
> > > why we get this vectorized with NEON which is because
> > >
> > > static opt_machine_mode
> > > aarch64_vectorize_related_mode (machine_mode vector_mode,
> > > scalar_mode element_mode,
> > > poly_uint64 nunits)
> > > {
> > > ...
> > >   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
> > >   if (TARGET_SIMD
> > >   && (vec_flags & VEC_ADVSIMD)
> > >   && known_eq (nunits, 0U)
> > >   && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
> > >   && maybe_ge (GET_MODE_BITSIZE (element_mode)
> > >* GET_MODE_NUNITS (vector_mode), 128U))
> > > {
> > >   machine_mode res = aarch64_simd_container_mode (element_mode, 128);
> > >   if (VECTOR_MODE_P (res))
> > > return res;
> > >
> > > which makes us get a V4SImode vector for a V4HImode loop vector_mode.
> > Thanks for the explanation!
> > >
> > > So I think the appropriate effective dejagnu target is
> > > aarch64-*-* (there's none specifically to advsimd, not sure if one
> > > can disable that?)
> > The attached patch uses aarch64*-*-* target check, and additionally
> > for SVE (and other targets supporting vect_fold_extract_last) it
> > checks
> > if the condition reduction was carried out using FOLD_EXTRACT_LAST.
> > Does that look OK ?
>
> Works for me.
Thanks, committed to trunk in dd606dc7c7e49feb7a900902ec6d35b421789173

Thanks,
Prathamesh
>
> Richard.
>
> > Thanks,
> > Prathamesh
> > >
> >
> > > Richard.
> > >
> > > > > Thanks,
> > > > > Prathamesh
> > > > > >
> > > > > > Thanks,
> > > > > > Richard
> > > > >
> > > >
> > > >
> > >
> > > --
> > > Richard Biener 
> > > SUSE Software Solutions Germany GmbH,
> > > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
> >
>
> --
> Richard Biener 
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH] tree-optimization/111048 - avoid flawed logic in fold_vec_perm

2023-08-21 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 21 Aug 2023 at 12:26, Richard Biener  wrote:
>
> On Sat, 19 Aug 2023, Prathamesh Kulkarni wrote:
>
> > On Fri, 18 Aug 2023 at 14:52, Richard Biener  wrote:
> > >
> > > On Fri, 18 Aug 2023, Richard Sandiford wrote:
> > >
> > > > Richard Biener  writes:
> > > > > The following avoids running into somehow flawed logic in 
> > > > > fold_vec_perm
> > > > > for non-VLA vectors.
> > > > >
> > > > > Bootstrap & regtest running on x86_64-unknown-linux-gnu.
> > > > >
> > > > > Richard.
> > > > >
> > > > > PR tree-optimization/111048
> > > > > * fold-const.cc (fold_vec_perm_cst): Check for non-VLA
> > > > > vectors first.
> > > > >
> > > > > * gcc.dg/torture/pr111048.c: New testcase.
> > > >
> > > > Please don't do this as a permanent thing.  It was a deliberate choice
> > > > to have the is_constant be the fallback, so that the "generic" (VLA+VLS)
> > > > logic gets more coverage.  Like you say, if something is wrong for VLS
> > > > then the chances are that it's also wrong for VLA.
> > >
> > > Sure, feel free to undo this change together with the fix for the
> > > VLA case.
> > Hi,
> > The attached patch reverts the workaround, and fixes the issue.
> > Bootstrapped+tested on aarch64-linux-gnu with and without SVE, and
> > x86_64-linux-gnu.
> > OK to commit ?
>
> OK.
Thanks, committed to trunk in 649388462e9a3c2de0b90ce525de8044704cc521

Thanks,
Prathamesh
>
> > Thanks,
> > Prathamesh
> > >
> > > Richard.
> > >
> > > > Thanks,
> > > > Richard
> > > >
> > > >
> > > > > ---
> > > > >  gcc/fold-const.cc   | 12 ++--
> > > > >  gcc/testsuite/gcc.dg/torture/pr111048.c | 24 
> > > > >  2 files changed, 30 insertions(+), 6 deletions(-)
> > > > >  create mode 100644 gcc/testsuite/gcc.dg/torture/pr111048.c
> > > > >
> > > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > > > > index 5c51c9d91be..144fd7481b3 100644
> > > > > --- a/gcc/fold-const.cc
> > > > > +++ b/gcc/fold-const.cc
> > > > > @@ -10625,6 +10625,11 @@ fold_vec_perm_cst (tree type, tree arg0, 
> > > > > tree arg1, const vec_perm_indices &sel,
> > > > >unsigned res_npatterns, res_nelts_per_pattern;
> > > > >unsigned HOST_WIDE_INT res_nelts;
> > > > >
> > > > > +  if (TYPE_VECTOR_SUBPARTS (type).is_constant (_nelts))
> > > > > +{
> > > > > +  res_npatterns = res_nelts;
> > > > > +  res_nelts_per_pattern = 1;
> > > > > +}
> > > > >/* (1) If SEL is a suitable mask as determined by
> > > > >   valid_mask_for_fold_vec_perm_cst_p, then:
> > > > >   res_npatterns = max of npatterns between ARG0, ARG1, and SEL
> > > > > @@ -10634,7 +10639,7 @@ fold_vec_perm_cst (tree type, tree arg0, tree 
> > > > > arg1, const vec_perm_indices &sel,
> > > > >   res_npatterns = nelts in result vector.
> > > > >   res_nelts_per_pattern = 1.
> > > > >   This exception is made so that VLS ARG0, ARG1 and SEL work as 
> > > > > before.  */
> > > > > -  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> > > > > +  else if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, 
> > > > > reason))
> > > > >  {
> > > > >res_npatterns
> > > > > = std::max (VECTOR_CST_NPATTERNS (arg0),
> > > > > @@ -10648,11 +10653,6 @@ fold_vec_perm_cst (tree type, tree arg0, 
> > > > > tree arg1, const vec_perm_indices &sel,
> > > > >
> > > > >res_nelts = res_npatterns * res_nelts_per_pattern;
> > > > >  }
> > > > > -  else if (TYPE_VECTOR_SUBPARTS (type).is_constant (_nelts))
> > > > > -{
> > > > > -  res_npatterns = res_nelts;
> > > > > -  res_nelts_per_pattern = 1;
> > > > > -}
> > > > >else
> > > > >  return NULL_TREE;
> > > > >
> > > > > diff --git a/gcc/testsuite/gcc.dg/torture/pr111048.c 
> > > > > b/g

Re: [PATCH][RFC] tree-optimization/92335 - Improve sinking heuristics for vectorization

2023-08-19 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 18 Aug 2023 at 17:11, Richard Biener  wrote:
>
> On Fri, 18 Aug 2023, Richard Biener wrote:
>
> > On Thu, 17 Aug 2023, Prathamesh Kulkarni wrote:
> >
> > > On Tue, 15 Aug 2023 at 14:28, Richard Sandiford
> > >  wrote:
> > > >
> > > > Richard Biener  writes:
> > > > > On Mon, 14 Aug 2023, Prathamesh Kulkarni wrote:
> > > > >> On Mon, 7 Aug 2023 at 13:19, Richard Biener 
> > > > >>  wrote:
> > > > >> > It doesn't seem to make a difference for x86.  That said, the 
> > > > >> > "fix" is
> > > > >> > probably sticking the correct target on the dump-check, it seems
> > > > >> > that vect_fold_extract_last is no longer correct here.
> > > > >> Um sorry, I did go thru various checks in target-supports.exp, but 
> > > > >> not
> > > > >> sure which one will be appropriate for this case,
> > > > >> and am stuck here :/ Could you please suggest how to proceed ?
> > > > >
> > > > > Maybe Richard S. knows the magic thing to test, he originally
> > > > > implemented the direct conversion support.  I suggest to implement
> > > > > such dg-checks if they are not present (I can't find them),
> > > > > possibly quite specific to the modes involved (like we have
> > > > > other checks with _qi_to_hi suffixes, for float modes maybe
> > > > > just _float).
> > > >
> > > > Yeah, can't remember specific selectors for that feature.  TBH I think
> > > > most (all?) of the tests were AArch64-specific.
> > > Hi,
> > > As Richi mentioned above, the test now vectorizes on AArch64 because
> > > it has support for direct conversion
> > > between vectors while x86 doesn't. IIUC this is because
> > > supportable_convert_operation returns true
> > > for V4HI -> V4SI on Aarch64 since it can use extend_v4hiv4si2 for
> > > doing the conversion ?
> > >
> > > In the attached patch, I added a new target check vect_extend which
> > > (currently) returns 1 only for aarch64*-*-*,
> > > which makes the test PASS on both the targets, altho I am not sure if
> > > this is entirely correct.
> > > Does the patch look OK ?
> >
> > Can you make vect_extend more specific, say vect_extend_hi_si or
> > what is specifically needed here?  Note I'll have to investigate
> > why x86 cannot vectorize here since in fact it does have
> > the extend operation ... it might be also worth splitting the
> > sign/zero extend case, so - vect_sign_extend_hi_si or
> > vect_extend_short_int?
>
> And now having analyzed _why_ x86 doesn't vectorize it's rather
> why we get this vectorized with NEON which is because
>
> static opt_machine_mode
> aarch64_vectorize_related_mode (machine_mode vector_mode,
> scalar_mode element_mode,
> poly_uint64 nunits)
> {
> ...
>   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
>   if (TARGET_SIMD
>   && (vec_flags & VEC_ADVSIMD)
>   && known_eq (nunits, 0U)
>   && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
>   && maybe_ge (GET_MODE_BITSIZE (element_mode)
>* GET_MODE_NUNITS (vector_mode), 128U))
> {
>   machine_mode res = aarch64_simd_container_mode (element_mode, 128);
>   if (VECTOR_MODE_P (res))
> return res;
>
> which makes us get a V4SImode vector for a V4HImode loop vector_mode.
Thanks for the explanation!
>
> So I think the appropriate effective dejagnu target is
> aarch64-*-* (there's none specifically to advsimd, not sure if one
> can disable that?)
The attached patch uses aarch64*-*-* target check, and additionally
for SVE (and other targets supporting vect_fold_extract_last) it
checks
if the condition reduction was carried out using FOLD_EXTRACT_LAST.
Does that look OK ?

Thanks,
Prathamesh
>

> Richard.
>
> > > Thanks,
> > > Prathamesh
> > > >
> > > > Thanks,
> > > > Richard
> > >
> >
> >
>
> --
> Richard Biener 
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-7.c 
b/gcc/testsuite/gcc.dg/vect/pr65947-7.c
index 16cdcd1c6eb..58c46df5c54 100644
--- a/gcc/testsuite/gcc.dg/vect/pr65947-7.c
+++ b/gcc/testsuite/gcc.dg/vect/pr65947-7.c
@@ -52,5 +52,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target 
vect_fold_extract_last } } } */
-/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { target { ! 
vect_fold_extract_last } } } } */
+/* { dg-final { scan-tree-dump "optimizing condition reduction with 
FOLD_EXTRACT_LAST" "vect" { target vect_fold_extract_last } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target aarch64*-*-* 
} } } */


Re: [PATCH] tree-optimization/111048 - avoid flawed logic in fold_vec_perm

2023-08-19 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 18 Aug 2023 at 14:52, Richard Biener  wrote:
>
> On Fri, 18 Aug 2023, Richard Sandiford wrote:
>
> > Richard Biener  writes:
> > > The following avoids running into somehow flawed logic in fold_vec_perm
> > > for non-VLA vectors.
> > >
> > > Bootstrap & regtest running on x86_64-unknown-linux-gnu.
> > >
> > > Richard.
> > >
> > > PR tree-optimization/111048
> > > * fold-const.cc (fold_vec_perm_cst): Check for non-VLA
> > > vectors first.
> > >
> > > * gcc.dg/torture/pr111048.c: New testcase.
> >
> > Please don't do this as a permanent thing.  It was a deliberate choice
> > to have the is_constant be the fallback, so that the "generic" (VLA+VLS)
> > logic gets more coverage.  Like you say, if something is wrong for VLS
> > then the chances are that it's also wrong for VLA.
>
> Sure, feel free to undo this change together with the fix for the
> VLA case.
Hi,
The attached patch reverts the workaround, and fixes the issue.
Bootstrapped+tested on aarch64-linux-gnu with and without SVE, and
x86_64-linux-gnu.
OK to commit ?

Thanks,
Prathamesh
>
> Richard.
>
> > Thanks,
> > Richard
> >
> >
> > > ---
> > >  gcc/fold-const.cc   | 12 ++--
> > >  gcc/testsuite/gcc.dg/torture/pr111048.c | 24 
> > >  2 files changed, 30 insertions(+), 6 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.dg/torture/pr111048.c
> > >
> > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > > index 5c51c9d91be..144fd7481b3 100644
> > > --- a/gcc/fold-const.cc
> > > +++ b/gcc/fold-const.cc
> > > @@ -10625,6 +10625,11 @@ fold_vec_perm_cst (tree type, tree arg0, tree 
> > > arg1, const vec_perm_indices &sel,
> > >unsigned res_npatterns, res_nelts_per_pattern;
> > >unsigned HOST_WIDE_INT res_nelts;
> > >
> > > +  if (TYPE_VECTOR_SUBPARTS (type).is_constant (_nelts))
> > > +{
> > > +  res_npatterns = res_nelts;
> > > +  res_nelts_per_pattern = 1;
> > > +}
> > >/* (1) If SEL is a suitable mask as determined by
> > >   valid_mask_for_fold_vec_perm_cst_p, then:
> > >   res_npatterns = max of npatterns between ARG0, ARG1, and SEL
> > > @@ -10634,7 +10639,7 @@ fold_vec_perm_cst (tree type, tree arg0, tree 
> > > arg1, const vec_perm_indices &sel,
> > >   res_npatterns = nelts in result vector.
> > >   res_nelts_per_pattern = 1.
> > >   This exception is made so that VLS ARG0, ARG1 and SEL work as 
> > > before.  */
> > > -  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> > > +  else if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
> > >  {
> > >res_npatterns
> > > = std::max (VECTOR_CST_NPATTERNS (arg0),
> > > @@ -10648,11 +10653,6 @@ fold_vec_perm_cst (tree type, tree arg0, tree 
> > > arg1, const vec_perm_indices &sel,
> > >
> > >res_nelts = res_npatterns * res_nelts_per_pattern;
> > >  }
> > > -  else if (TYPE_VECTOR_SUBPARTS (type).is_constant (_nelts))
> > > -{
> > > -  res_npatterns = res_nelts;
> > > -  res_nelts_per_pattern = 1;
> > > -}
> > >else
> > >  return NULL_TREE;
> > >
> > > diff --git a/gcc/testsuite/gcc.dg/torture/pr111048.c 
> > > b/gcc/testsuite/gcc.dg/torture/pr111048.c
> > > new file mode 100644
> > > index 000..475978aae2b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/torture/pr111048.c
> > > @@ -0,0 +1,24 @@
> > > +/* { dg-do run } */
> > > +/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
> > > +
> > > +typedef unsigned char u8;
> > > +
> > > +__attribute__((noipa))
> > > +static void check(const u8 * v) {
> > > +if (*v != 15) __builtin_trap();
> > > +}
> > > +
> > > +__attribute__((noipa))
> > > +static void bug(void) {
> > > +u8 in_lanes[32];
> > > +for (unsigned i = 0; i < 32; i += 2) {
> > > +  in_lanes[i + 0] = 0;
> > > +  in_lanes[i + 1] = ((u8)0xff) >> (i & 7);
> > > +}
> > > +
> > > +check(_lanes[13]);
> > > +  }
> > > +
> > > +int main() {
> > > +bug();
> > > +}
> >
>
> --
> Richard Biener 
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
PR111048: Set arg_npatterns correctly.

In valid_mask_for_fold_vec_perm_cst we set arg_npatterns always
to VECTOR_CST_NPATTERNS (arg0) because of (q1 & 0) == 0 in
following condition:

 /* Ensure that the stepped sequence always selects from the same
 input pattern.  */
  unsigned arg_npatterns
= ((q1 & 0) == 0) ? VECTOR_CST_NPATTERNS (arg0)
  : VECTOR_CST_NPATTERNS (arg1);

resulting in wrong code-gen issues.
The patch fixes this by changing the condition to (q1 & 1) == 0.

gcc/ChangeLog:
PR tree-optimization/111048
* fold-const.cc (valid_mask_for_fold_vec_perm_cst_p): Set arg_npatterns
correctly.
(fold_vec_perm_cst): Remove workaround and again call
valid_mask_fold_vec_perm_cst_p for both VLS and VLA vectors.

Re: [PATCH][RFC] tree-optimization/92335 - Improve sinking heuristics for vectorization

2023-08-17 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 15 Aug 2023 at 14:28, Richard Sandiford
 wrote:
>
> Richard Biener  writes:
> > On Mon, 14 Aug 2023, Prathamesh Kulkarni wrote:
> >> On Mon, 7 Aug 2023 at 13:19, Richard Biener  
> >> wrote:
> >> > It doesn't seem to make a difference for x86.  That said, the "fix" is
> >> > probably sticking the correct target on the dump-check, it seems
> >> > that vect_fold_extract_last is no longer correct here.
> >> Um sorry, I did go thru various checks in target-supports.exp, but not
> >> sure which one will be appropriate for this case,
> >> and am stuck here :/ Could you please suggest how to proceed ?
> >
> > Maybe Richard S. knows the magic thing to test, he originally
> > implemented the direct conversion support.  I suggest to implement
> > such dg-checks if they are not present (I can't find them),
> > possibly quite specific to the modes involved (like we have
> > other checks with _qi_to_hi suffixes, for float modes maybe
> > just _float).
>
> Yeah, can't remember specific selectors for that feature.  TBH I think
> most (all?) of the tests were AArch64-specific.
Hi,
As Richi mentioned above, the test now vectorizes on AArch64 because
it has support for direct conversion
between vectors while x86 doesn't. IIUC this is because
supportable_convert_operation returns true
for V4HI -> V4SI on Aarch64 since it can use extend_v4hiv4si2 for
doing the conversion ?

In the attached patch, I added a new target check vect_extend which
(currently) returns 1 only for aarch64*-*-*,
which makes the test PASS on both the targets, altho I am not sure if
this is entirely correct.
Does the patch look OK ?

Thanks,
Prathamesh
>
> Thanks,
> Richard
diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-7.c 
b/gcc/testsuite/gcc.dg/vect/pr65947-7.c
index 16cdcd1c6eb..c8623854af5 100644
--- a/gcc/testsuite/gcc.dg/vect/pr65947-7.c
+++ b/gcc/testsuite/gcc.dg/vect/pr65947-7.c
@@ -52,5 +52,4 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target 
vect_fold_extract_last } } } */
-/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { target { ! 
vect_fold_extract_last } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target vect_extend } 
} } */
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 92b6f69730e..29ef64b84f3 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -7768,6 +7768,16 @@ proc check_effective_target_vect_unpack { } {
 || [istarget amdgcn*-*-*] }}]
 }
 
+# Return 1 if the target plus current options supports vector
+# conversion of chars (to shorts) and shorts (to ints), 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_extend { } {
+return [check_cached_effective_target_indexed vect_extend {
+  expr { [istarget aarch64*-*-*]}}]
+}
+
 # Return 1 if the target plus current options does not guarantee
 # that its STACK_BOUNDARY is >= the reguired vector alignment.
 #


Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-16 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 16 Aug 2023 at 15:21, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> >> Unfortunately, the patch regressed following tests on ppc64le and
> >> armhf respectively:
> >> gcc.target/powerpc/vec-perm-ctor.c scan-tree-dump-not optimized
> >> "VIEW_CONVERT_EXPR"
> >> gcc.dg/tree-ssa/forwprop-20.c scan-tree-dump-not forwprop1 "VEC_PERM_EXPR"
> >>
> >> This happens because of the change to vect_cst_ctor_array which
> >> removes handling of VECTOR_CST,
> >> and thus we return NULL_TREE for cases where VEC_PERM_EXPR has
> >> vector_cst, ctor input operands.
> >>
> >> For eg we fail to fold VEC_PERM_EXPR for the following test taken from
> >> forwprop-20.c:
> >> void f (double d, vecf* r)
> >> {
> >>   vecf x = { -d, 5 };
> >>   vecf y = {  1, 4 };
> >>   veci m = {  2, 0 };
> >>   *r = __builtin_shuffle (x, y, m); // { 1, -d }
> >> }
> >> because vect_cst_ctor_to_array will now return NULL_TREE for vector_cst 
> >> {1, 4}.
> >>
> >> The attached patch thus reverts the changes to vect_cst_ctor_to_array,
> >> which makes the tests pass again.
> >> I have put the patch for another round of bootstrap+test on the above
> >> targets (aarch64, aarch64-sve, x86_64, armhf, ppc64le).
> >> OK to commit if it passes ?
> > The patch now passes bootstrap+test on all these targets.
>
> OK, thanks.
Thanks a lot for the helpful reviews! Committed in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=a7dba4a1c05a76026d88d0b519cf83bff9a2

Thanks,
Prathamesh
>
> Richard


Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-16 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 15 Aug 2023 at 16:59, Prathamesh Kulkarni
 wrote:
>
> On Mon, 14 Aug 2023 at 18:23, Richard Sandiford
>  wrote:
> >
> > Prathamesh Kulkarni  writes:
> > > On Thu, 10 Aug 2023 at 21:27, Richard Sandiford
> > >  wrote:
> > >>
> > >> Prathamesh Kulkarni  writes:
> > >> >> static bool
> > >> >> is_simple_vla_size (poly_uint64 size)
> > >> >> {
> > >> >>   if (size.is_constant ())
> > >> >> return false;
> > >> >>   for (int i = 1; i < ARRAY_SIZE (size.coeffs); ++i)
> > >> >> if (size[i] != (i <= 1 ? size[0] : 0))
> > >> > Just wondering if this should be (i == 1 ? size[0] : 0) since i is
> > >> > initialized to 1 ?
> > >>
> > >> Both work.  I prefer <= 1 because it doesn't depend on the micro
> > >> optimisation to start at coefficient 1.  In a theoretical 3-indeterminate
> > >> poly_int, we want the first 2 coefficients to be nonzero and the rest to
> > >> be zero.
> > >>
> > >> > IIUC, is_simple_vla_size should return true for polynomials of first
> > >> > degree and having same coeff like 4 + 4x ?
> > >>
> > >> FWIW, poly_int only supports first-degree polynomials at the moment.
> > >> coeffs>2 means there is more than one indeterminate, rather than a
> > >> higher power.
> > > Oh OK, thanks for the clarification.
> > >>
> > >> >>   return false;
> > >> >>   return true;
> > >> >> }
> > >> >>
> > >> >>
> > >> >>   FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> > >> >> {
> > >> >>   auto nunits = GET_MODE_NUNITS (mode);
> > >> >>   if (!is_simple_vla_size (nunits))
> > >> >> continue;
> > >> >>   if (nunits[0] ...)
> > >> >> test_... (mode);
> > >> >>   ...
> > >> >>
> > >> >> }
> > >> >>
> > >> >> test_vnx4si_v4si and test_v4si_vnx4si look good.  But with the
> > >> >> loop structure above, I think we can apply the test_vnx4si and
> > >> >> test_vnx16qi to more cases.  So the classification isn't the
> > >> >> exact number of elements, but instead a limit.
> > >> >>
> > >> >> I think the nunits[0] conditions for test_vnx4si are as follows
> > >> >> (inspection only, so could be wrong):
> > >> >>
> > >> >> > +/* Test cases where result and input vectors are VNx4SI  */
> > >> >> > +
> > >> >> > +static void
> > >> >> > +test_vnx4si (machine_mode vmode)
> > >> >> > +{
> > >> >> > +  /* Case 1: mask = {0, ...} */
> > >> >> > +  {
> > >> >> > +tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
> > >> >> > +tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
> > >> >> > +poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> > >> >> > +
> > >> >> > +vec_perm_builder builder (len, 1, 1);
> > >> >> > +builder.quick_push (0);
> > >> >> > +vec_perm_indices sel (builder, 2, len);
> > >> >> > +tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, 
> > >> >> > sel);
> > >> >> > +
> > >> >> > +tree expected_res[] = { vector_cst_elt (res, 0) };
> > >> > This should be { vector_cst_elt (arg0, 0) }; will fix in next patch.
> > >> >> > +validate_res (1, 1, res, expected_res);
> > >> >> > +  }
> > >> >>
> > >> >> nunits[0] >= 2 (could be all nunits if the inputs had 
> > >> >> nelts_per_pattern==1,
> > >> >> which I think would be better)
> > >> > IIUC, the vectors that can be used for a particular test should have
> > >> > nunits[0] >= res_npatterns,
> > >> > where res_npatterns is as computed in fold_vec_perm_cst without the
> > >> > canonicalization ?
> > >> > For above test -- res_npatterns = max(2, max (2, 1)) == 2, so we
> > >> > require nunits[0] >= 2 ?
> > >> > Which implies 

Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-15 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 14 Aug 2023 at 18:23, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Thu, 10 Aug 2023 at 21:27, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> >> static bool
> >> >> is_simple_vla_size (poly_uint64 size)
> >> >> {
> >> >>   if (size.is_constant ())
> >> >> return false;
> >> >>   for (int i = 1; i < ARRAY_SIZE (size.coeffs); ++i)
> >> >> if (size[i] != (i <= 1 ? size[0] : 0))
> >> > Just wondering if this should be (i == 1 ? size[0] : 0) since i is
> >> > initialized to 1 ?
> >>
> >> Both work.  I prefer <= 1 because it doesn't depend on the micro
> >> optimisation to start at coefficient 1.  In a theoretical 3-indeterminate
> >> poly_int, we want the first 2 coefficients to be nonzero and the rest to
> >> be zero.
> >>
> >> > IIUC, is_simple_vla_size should return true for polynomials of first
> >> > degree and having same coeff like 4 + 4x ?
> >>
> >> FWIW, poly_int only supports first-degree polynomials at the moment.
> >> coeffs>2 means there is more than one indeterminate, rather than a
> >> higher power.
> > Oh OK, thanks for the clarification.
> >>
> >> >>   return false;
> >> >>   return true;
> >> >> }
> >> >>
> >> >>
> >> >>   FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> >> >> {
> >> >>   auto nunits = GET_MODE_NUNITS (mode);
> >> >>   if (!is_simple_vla_size (nunits))
> >> >> continue;
> >> >>   if (nunits[0] ...)
> >> >> test_... (mode);
> >> >>   ...
> >> >>
> >> >> }
> >> >>
> >> >> test_vnx4si_v4si and test_v4si_vnx4si look good.  But with the
> >> >> loop structure above, I think we can apply the test_vnx4si and
> >> >> test_vnx16qi to more cases.  So the classification isn't the
> >> >> exact number of elements, but instead a limit.
> >> >>
> >> >> I think the nunits[0] conditions for test_vnx4si are as follows
> >> >> (inspection only, so could be wrong):
> >> >>
> >> >> > +/* Test cases where result and input vectors are VNx4SI  */
> >> >> > +
> >> >> > +static void
> >> >> > +test_vnx4si (machine_mode vmode)
> >> >> > +{
> >> >> > +  /* Case 1: mask = {0, ...} */
> >> >> > +  {
> >> >> > +tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
> >> >> > +tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
> >> >> > +poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> >> >> > +
> >> >> > +vec_perm_builder builder (len, 1, 1);
> >> >> > +builder.quick_push (0);
> >> >> > +vec_perm_indices sel (builder, 2, len);
> >> >> > +tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> >> >> > +
> >> >> > +tree expected_res[] = { vector_cst_elt (res, 0) };
> >> > This should be { vector_cst_elt (arg0, 0) }; will fix in next patch.
> >> >> > +validate_res (1, 1, res, expected_res);
> >> >> > +  }
> >> >>
> >> >> nunits[0] >= 2 (could be all nunits if the inputs had 
> >> >> nelts_per_pattern==1,
> >> >> which I think would be better)
> >> > IIUC, the vectors that can be used for a particular test should have
> >> > nunits[0] >= res_npatterns,
> >> > where res_npatterns is as computed in fold_vec_perm_cst without the
> >> > canonicalization ?
> >> > For above test -- res_npatterns = max(2, max (2, 1)) == 2, so we
> >> > require nunits[0] >= 2 ?
> >> > Which implies we can use above test for vectors with length 2 + 2x, 4 + 
> >> > 4x, etc.
> >>
> >> Right, that's what I meant.  With the inputs as they stand it has to be
> >> nunits[0] >= 2.  We need that to form the inputs correctly.  But if the
> >> inputs instead had nelts_per_pattern == 1, the test would work for all
> >> nunits.
> > In the attached patch, I have reordered the tests based on min or max limit.
> > For tests where sel_npatterns &

Re: [pushed]LRA]: Fix asserts for output stack pointer reloads

2023-08-14 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 14 Aug 2023 at 06:39, Vladimir Makarov via Gcc-patches
 wrote:
>
> The following patch fixes useless asserts in my latest patch
> implementing output stack pointer reloads.
Hi Vladimir,
It seems that this patch caused the following ICE on aarch64-linux-gnu
while building cp-demangle.c:
compile:  
/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/stage1-build/./gcc/xgcc
-B/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/stage1-build/./gcc/
-B/usr/local/aarch64-unknown-linux-gnu/bin/
-B/usr/local/aarch64-unknown-linux-gnu/lib/ -isystem
/usr/local/aarch64-unknown-linux-gnu/include -isystem
/usr/local/aarch64-unknown-linux-gnu/sys-include -DHAVE_CONFIG_H -I..
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/gcc/libstdc++-v3/../libiberty
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/gcc/libstdc++-v3/../include
-D_GLIBCXX_SHARED
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/stage1-build/aarch64-unknown-linux-gnu/libstdc++-v3/include/aarch64-unknown-linux-gnu
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/stage1-build/aarch64-unknown-linux-gnu/libstdc++-v3/include
-I/home/prathamesh.kulkarni/gnu-toolchain/gcc/master/gcc/libstdc++-v3/libsupc++
-g -O2 -DIN_GLIBCPP_V3 -Wno-error -c cp-demangle.c  -fPIC -DPIC -o
cp-demangle.o
during RTL pass: reload
cp-demangle.c: In function ‘d_demangle_callback.constprop’:
cp-demangle.c:6815:1: internal compiler error: in curr_insn_transform,
at lra-constraints.cc:4854
 6815 | }
  | ^
0xce6b37 curr_insn_transform
../../gcc/gcc/lra-constraints.cc:4854
0xce7887 lra_constraints(bool)
../../gcc/gcc/lra-constraints.cc:5478
0xccdfa7 lra(_IO_FILE*)
../../gcc/gcc/lra.cc:2419
0xc7e417 do_reload
../../gcc/gcc/ira.cc:5970
0xc7e417 execute
../../gcc/gcc/ira.cc:6156
Please submit a full bug report, with preprocessed source (by using
-freport-bug).
Please include the complete backtrace with any bug report.

Thanks,
Prathamesh


Re: [PATCH][RFC] tree-optimization/92335 - Improve sinking heuristics for vectorization

2023-08-14 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 7 Aug 2023 at 13:19, Richard Biener  wrote:
>
> On Mon, Aug 7, 2023 at 2:05 AM Prathamesh Kulkarni via Gcc-patches
>  wrote:
> >
> > On Thu, 3 Aug 2023 at 17:48, Richard Biener  wrote:
> > >
> > > On Thu, 3 Aug 2023, Richard Biener wrote:
> > >
> > > > On Thu, 3 Aug 2023, Richard Biener wrote:
> > > >
> > > > > On Thu, 3 Aug 2023, Prathamesh Kulkarni wrote:
> > > > >
> > > > > > On Wed, 2 Aug 2023 at 14:17, Richard Biener via Gcc-patches
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Mon, 31 Jul 2023, Jeff Law wrote:
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > On 7/28/23 01:05, Richard Biener via Gcc-patches wrote:
> > > > > > > > > The following delays sinking of loads within the same 
> > > > > > > > > innermost
> > > > > > > > > loop when it was unconditional before.  That's a not uncommon
> > > > > > > > > issue preventing vectorization when masked loads are not 
> > > > > > > > > available.
> > > > > > > > >
> > > > > > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > > > > > > > >
> > > > > > > > > I have a followup patch improving sinking that without this 
> > > > > > > > > would
> > > > > > > > > cause more of the problematic sinking - now that we have a 
> > > > > > > > > second
> > > > > > > > > sink pass after loop opts this looks like a reasonable 
> > > > > > > > > approach?
> > > > > > > > >
> > > > > > > > > OK?
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > > Richard.
> > > > > > > > >
> > > > > > > > >  PR tree-optimization/92335
> > > > > > > > >  * tree-ssa-sink.cc (select_best_block): Before loop
> > > > > > > > >  optimizations avoid sinking unconditional loads/stores
> > > > > > > > >  in innermost loops to conditional executed places.
> > > > > > > > >
> > > > > > > > >  * gcc.dg/tree-ssa/ssa-sink-10.c: Disable vectorizing.
> > > > > > > > >  * gcc.dg/tree-ssa/predcom-9.c: Clone from ssa-sink-10.c,
> > > > > > > > >  expect predictive commoning to happen instead of sinking.
> > > > > > > > >  * gcc.dg/vect/pr65947-3.c: Adjust.
> > > > > > > > I think it's reasonable -- there's probably going to be cases 
> > > > > > > > where it's not
> > > > > > > > great, but more often than not I think it's going to be a 
> > > > > > > > reasonable
> > > > > > > > heuristic.
> > > > > > > >
> > > > > > > > If there is undesirable fallout, better to find it over the 
> > > > > > > > coming months than
> > > > > > > > next spring.  So I'd suggest we go forward now to give more 
> > > > > > > > time to find any
> > > > > > > > pathological cases (if they exist).
> > > > > > >
> > > > > > > Agreed, I've pushed this now.
> > > > > > Hi Richard,
> > > > > > After this patch (committed in 
> > > > > > 399c8dd44ff44f4b496223c7cc980651c4d6f6a0),
> > > > > > pr65947-7.c "failed" for aarch64-linux-gnu:
> > > > > > FAIL: gcc.dg/vect/pr65947-7.c scan-tree-dump-not vect "LOOP 
> > > > > > VECTORIZED"
> > > > > > FAIL: gcc.dg/vect/pr65947-7.c -flto -ffat-lto-objects
> > > > > > scan-tree-dump-not vect "LOOP VECTORIZED"
> > > > > >
> > > > > > /* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { 
> > > > > > target {
> > > > > > ! vect_fold_extract_last } } } } */
> > > > > >
> > > > > > With your commit, condition_reduction in pr65947-7.c gets vectorized
> > > > > > regardless of vect_fold_ext

Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-13 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 10 Aug 2023 at 21:27, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> >> static bool
> >> is_simple_vla_size (poly_uint64 size)
> >> {
> >>   if (size.is_constant ())
> >> return false;
> >>   for (int i = 1; i < ARRAY_SIZE (size.coeffs); ++i)
> >> if (size[i] != (i <= 1 ? size[0] : 0))
> > Just wondering if this should be (i == 1 ? size[0] : 0) since i is
> > initialized to 1 ?
>
> Both work.  I prefer <= 1 because it doesn't depend on the micro
> optimisation to start at coefficient 1.  In a theoretical 3-indeterminate
> poly_int, we want the first 2 coefficients to be nonzero and the rest to
> be zero.
>
> > IIUC, is_simple_vla_size should return true for polynomials of first
> > degree and having same coeff like 4 + 4x ?
>
> FWIW, poly_int only supports first-degree polynomials at the moment.
> coeffs>2 means there is more than one indeterminate, rather than a
> higher power.
Oh OK, thanks for the clarification.
>
> >>   return false;
> >>   return true;
> >> }
> >>
> >>
> >>   FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> >> {
> >>   auto nunits = GET_MODE_NUNITS (mode);
> >>   if (!is_simple_vla_size (nunits))
> >> continue;
> >>   if (nunits[0] ...)
> >> test_... (mode);
> >>   ...
> >>
> >> }
> >>
> >> test_vnx4si_v4si and test_v4si_vnx4si look good.  But with the
> >> loop structure above, I think we can apply the test_vnx4si and
> >> test_vnx16qi to more cases.  So the classification isn't the
> >> exact number of elements, but instead a limit.
> >>
> >> I think the nunits[0] conditions for test_vnx4si are as follows
> >> (inspection only, so could be wrong):
> >>
> >> > +/* Test cases where result and input vectors are VNx4SI  */
> >> > +
> >> > +static void
> >> > +test_vnx4si (machine_mode vmode)
> >> > +{
> >> > +  /* Case 1: mask = {0, ...} */
> >> > +  {
> >> > +tree arg0 = build_vec_cst_rand (vmode, 2, 3, 1);
> >> > +tree arg1 = build_vec_cst_rand (vmode, 2, 3, 1);
> >> > +poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> >> > +
> >> > +vec_perm_builder builder (len, 1, 1);
> >> > +builder.quick_push (0);
> >> > +vec_perm_indices sel (builder, 2, len);
> >> > +tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> >> > +
> >> > +tree expected_res[] = { vector_cst_elt (res, 0) };
> > This should be { vector_cst_elt (arg0, 0) }; will fix in next patch.
> >> > +validate_res (1, 1, res, expected_res);
> >> > +  }
> >>
> >> nunits[0] >= 2 (could be all nunits if the inputs had nelts_per_pattern==1,
> >> which I think would be better)
> > IIUC, the vectors that can be used for a particular test should have
> > nunits[0] >= res_npatterns,
> > where res_npatterns is as computed in fold_vec_perm_cst without the
> > canonicalization ?
> > For above test -- res_npatterns = max(2, max (2, 1)) == 2, so we
> > require nunits[0] >= 2 ?
> > Which implies we can use above test for vectors with length 2 + 2x, 4 + 4x, 
> > etc.
>
> Right, that's what I meant.  With the inputs as they stand it has to be
> nunits[0] >= 2.  We need that to form the inputs correctly.  But if the
> inputs instead had nelts_per_pattern == 1, the test would work for all
> nunits.
In the attached patch, I have reordered the tests based on min or max limit.
For tests where sel_npatterns < 3 (ie dup sequence), I have kept input
npatterns = 1,
so we can test more vector modes, and also input npatterns matter only
for stepped sequence in sel
(Since for a dup pattern we don't enforce the constraint of selecting
elements from same input pattern).
Does it look OK ?

For the following tests with input vectors having shape (1, 3)
sel = {0, 1, 2, ...}  // (1, 3)
res = { arg0[0], arg0[1], arg0[2], ... } // (1, 3)

and sel = {len, len + 1, len + 2, ... }  // (1, 3)
res = { arg1[0], arg1[1], arg1[2], ... } // (1, 3)

Altho res_npatterns = 1, I suppose these will need to be tested with
vectors with length >= 4 + 4x,
since index 2 can be ambiguous for length 2 + 2x  ?
(In the patch, these are cases 2 and 3 in test_nunits_min_4)

Patch is bootstrapped+tested on aarch64-linux-gnu with and without SVE
and on x86_64-linux-gnu
(altho I suppose bootstrapping won't be necessary for changes to unit-tests?)
>
> > Sorry if this sounds

Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-10 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 8 Aug 2023 at 15:27, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Fri, 4 Aug 2023 at 20:36, Richard Sandiford
> >  wrote:
> >>
> >> Full review this time, sorry for the skipping the tests earlier.
> > Thanks for the detailed review! Please find my responses inline below.
> >>
> >> Prathamesh Kulkarni  writes:
> >> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> >> > index 7e5494dfd39..680d0e54fd4 100644
> >> > --- a/gcc/fold-const.cc
> >> > +++ b/gcc/fold-const.cc
> >> > @@ -85,6 +85,10 @@ along with GCC; see the file COPYING3.  If not see
> >> >  #include "vec-perm-indices.h"
> >> >  #include "asan.h"
> >> >  #include "gimple-range.h"
> >> > +#include 
> >>
> >> This should be included by defining INCLUDE_ALGORITHM instead.
> > Done. Just curious, why do we use this macro instead of directly
> > including  ?
>
> AIUI, one of the reasons for having every file start with includes
> of config.h and (b)system.h, in that order, is to ensure that a small
> and predictable amount of GCC-specific stuff happens before including
> the system header files.  That helps to avoid OS-specific clashes between
> GCC code and system headers.
>
> But another major reason is that system.h ends by poisoning a lot of
> stuff that system headers would be entitled to use.
Ah OK, thanks for the clarification!
>
> >> > +  tree_vector_builder builder (vectype, npatterns, nelts_per_pattern);
> >> > +
> >> > +  // Fill a0 for each pattern
> >> > +  for (unsigned i = 0; i < npatterns; i++)
> >> > +builder.quick_push (build_int_cst (inner_type, rand () % 100));
> >> > +
> >> > +  if (nelts_per_pattern == 1)
> >> > +return builder.build ();
> >> > +
> >> > +  // Fill a1 for each pattern
> >> > +  for (unsigned i = 0; i < npatterns; i++)
> >> > +builder.quick_push (build_int_cst (inner_type, rand () % 100));
> >> > +
> >> > +  if (nelts_per_pattern == 2)
> >> > +return builder.build ();
> >> > +
> >> > +  for (unsigned i = npatterns * 2; i < npatterns * nelts_per_pattern; 
> >> > i++)
> >> > +{
> >> > +  tree prev_elem = builder[i - npatterns];
> >> > +  int prev_elem_val = TREE_INT_CST_LOW (prev_elem);
> >> > +  int val = prev_elem_val + S;
> >> > +  builder.quick_push (build_int_cst (inner_type, val));
> >> > +}
> >> > +
> >> > +  return builder.build ();
> >> > +}
> >> > +
> >> > +static void
> >> > +validate_res (unsigned npatterns, unsigned nelts_per_pattern,
> >> > +   tree res, tree *expected_res)
> >> > +{
> >> > +  ASSERT_TRUE (VECTOR_CST_NPATTERNS (res) == npatterns);
> >> > +  ASSERT_TRUE (VECTOR_CST_NELTS_PER_PATTERN (res) == nelts_per_pattern);
> >>
> >> I don't think this is safe when the inputs are randomised.  E.g. we
> >> could by chance end up with a vector of all zeros, which would have
> >> a single pattern and a single element per pattern, regardless of the
> >> shapes of the inputs.
> >>
> >> Given the way that vector_builder::finalize
> >> canonicalises the encoding, it should be safe to use:
> >>
> >> * VECTOR_CST_NPATTERNS (res) <= npatterns
> >> * vector_cst_encoded_nelts (res) <= npatterns * nelts_per_pattern
> >>
> >> If we do that then...
> >>
> >> > +
> >> > +  for (unsigned i = 0; i < vector_cst_encoded_nelts (res); i++)
> >>
> >> ...this loop bound should be npatterns * nelts_per_pattern instead.
> > Ah indeed. Fixed, thanks.
>
> The patch instead does:
>
>   ASSERT_TRUE (VECTOR_CST_NPATTERNS (res) <= npatterns);
>   ASSERT_TRUE (VECTOR_CST_NELTS_PER_PATTERN (res) <= nelts_per_pattern);
>
> I think the version I suggested is safer.  It's not the goal of the
> canonicalisation algorithm to reduce both npatterns and nelts_per_pattern
> individually.  The algorithm can increase nelts_per_pattern in order
> to decrease npatterns.
Oops, sorry I misread, will fix in the next patch.
>
> >> > +  {
> >> > +tree arg0 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> >> > +tree arg1 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> >> > +poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TRE

Re: [PATCH][RFC] tree-optimization/92335 - Improve sinking heuristics for vectorization

2023-08-06 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 3 Aug 2023 at 17:48, Richard Biener  wrote:
>
> On Thu, 3 Aug 2023, Richard Biener wrote:
>
> > On Thu, 3 Aug 2023, Richard Biener wrote:
> >
> > > On Thu, 3 Aug 2023, Prathamesh Kulkarni wrote:
> > >
> > > > On Wed, 2 Aug 2023 at 14:17, Richard Biener via Gcc-patches
> > > >  wrote:
> > > > >
> > > > > On Mon, 31 Jul 2023, Jeff Law wrote:
> > > > >
> > > > > >
> > > > > >
> > > > > > On 7/28/23 01:05, Richard Biener via Gcc-patches wrote:
> > > > > > > The following delays sinking of loads within the same innermost
> > > > > > > loop when it was unconditional before.  That's a not uncommon
> > > > > > > issue preventing vectorization when masked loads are not 
> > > > > > > available.
> > > > > > >
> > > > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > > > > > >
> > > > > > > I have a followup patch improving sinking that without this would
> > > > > > > cause more of the problematic sinking - now that we have a second
> > > > > > > sink pass after loop opts this looks like a reasonable approach?
> > > > > > >
> > > > > > > OK?
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Richard.
> > > > > > >
> > > > > > >  PR tree-optimization/92335
> > > > > > >  * tree-ssa-sink.cc (select_best_block): Before loop
> > > > > > >  optimizations avoid sinking unconditional loads/stores
> > > > > > >  in innermost loops to conditional executed places.
> > > > > > >
> > > > > > >  * gcc.dg/tree-ssa/ssa-sink-10.c: Disable vectorizing.
> > > > > > >  * gcc.dg/tree-ssa/predcom-9.c: Clone from ssa-sink-10.c,
> > > > > > >  expect predictive commoning to happen instead of sinking.
> > > > > > >  * gcc.dg/vect/pr65947-3.c: Adjust.
> > > > > > I think it's reasonable -- there's probably going to be cases where 
> > > > > > it's not
> > > > > > great, but more often than not I think it's going to be a reasonable
> > > > > > heuristic.
> > > > > >
> > > > > > If there is undesirable fallout, better to find it over the coming 
> > > > > > months than
> > > > > > next spring.  So I'd suggest we go forward now to give more time to 
> > > > > > find any
> > > > > > pathological cases (if they exist).
> > > > >
> > > > > Agreed, I've pushed this now.
> > > > Hi Richard,
> > > > After this patch (committed in 
> > > > 399c8dd44ff44f4b496223c7cc980651c4d6f6a0),
> > > > pr65947-7.c "failed" for aarch64-linux-gnu:
> > > > FAIL: gcc.dg/vect/pr65947-7.c scan-tree-dump-not vect "LOOP VECTORIZED"
> > > > FAIL: gcc.dg/vect/pr65947-7.c -flto -ffat-lto-objects
> > > > scan-tree-dump-not vect "LOOP VECTORIZED"
> > > >
> > > > /* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { target {
> > > > ! vect_fold_extract_last } } } } */
> > > >
> > > > With your commit, condition_reduction in pr65947-7.c gets vectorized
> > > > regardless of vect_fold_extract_last,
> > > > which gates the above test (which is an improvement, because the
> > > > function didn't get vectorized before the commit).
> > > >
> > > > The attached patch thus removes the gating on vect_fold_extract_last,
> > > > and the test passes again.
> > > > OK to commit ?
> > >
> > > OK.
> >
> > Or wait - the loop doesn't vectorize on x86_64, so I guess one
> > critical target condition is missing.  Can you figure out which?
>
> I see
>
> /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr65947-7.c:18:21:
> note:   vect_is_simple_use: operand last_19 = PHI ,
> type of def: reduction
> /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr65947-7.c:18:21:
> note:   vect_is_simple_use: vectype vector(4) int
> /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr65947-7.c:18:21:
> missed:   multiple types in double reduction or condition reduction or
> fold-left reduction.
> /space/rguenther/src/gcc/gcc/testsuite/gcc.dg/vect/pr6594

Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-06 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 4 Aug 2023 at 20:36, Richard Sandiford
 wrote:
>
> Full review this time, sorry for the skipping the tests earlier.
Thanks for the detailed review! Please find my responses inline below.
>
> Prathamesh Kulkarni  writes:
> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > index 7e5494dfd39..680d0e54fd4 100644
> > --- a/gcc/fold-const.cc
> > +++ b/gcc/fold-const.cc
> > @@ -85,6 +85,10 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "vec-perm-indices.h"
> >  #include "asan.h"
> >  #include "gimple-range.h"
> > +#include 
>
> This should be included by defining INCLUDE_ALGORITHM instead.
Done. Just curious, why do we use this macro instead of directly
including  ?
>
> > +#include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "print-tree.h"
>
> Are these still needed, or were they for debugging?
Just for debugging, removed.
>
> >
> >  /* Nonzero if we are folding constants inside an initializer or a C++
> > manifestly-constant-evaluated context; zero otherwise.
> > @@ -10494,15 +10498,9 @@ fold_mult_zconjz (location_t loc, tree type, tree 
> > expr)
> >  static bool
> >  vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
> >  {
> > -  unsigned HOST_WIDE_INT i, nunits;
> > +  unsigned HOST_WIDE_INT i;
> >
> > -  if (TREE_CODE (arg) == VECTOR_CST
> > -  && VECTOR_CST_NELTS (arg).is_constant ())
> > -{
> > -  for (i = 0; i < nunits; ++i)
> > - elts[i] = VECTOR_CST_ELT (arg, i);
> > -}
> > -  else if (TREE_CODE (arg) == CONSTRUCTOR)
> > +  if (TREE_CODE (arg) == CONSTRUCTOR)
> >  {
> >constructor_elt *elt;
> >
> > @@ -10520,6 +10518,192 @@ vec_cst_ctor_to_array (tree arg, unsigned int 
> > nelts, tree *elts)
> >return true;
> >  }
> >
> > +/* Helper routine for fold_vec_perm_cst to check if SEL is a suitable
> > +   mask for VLA vec_perm folding.
> > +   REASON if specified, will contain the reason why SEL is not suitable.
> > +   Used only for debugging and unit-testing.
> > +   VERBOSE if enabled is used for debugging output.  */
> > +
> > +static bool
> > +valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree arg1,
> > + const vec_perm_indices ,
> > + const char **reason = NULL,
> > + ATTRIBUTE_UNUSED bool verbose = false)
>
> Since verbose is no longer needed (good!), I think we should just remove it.
Done.
>
> > +{
> > +  unsigned sel_npatterns = sel.encoding ().npatterns ();
> > +  unsigned sel_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
> > +
> > +  if (!(pow2p_hwi (sel_npatterns)
> > + && pow2p_hwi (VECTOR_CST_NPATTERNS (arg0))
> > + && pow2p_hwi (VECTOR_CST_NPATTERNS (arg1
> > +{
> > +  if (reason)
> > + *reason = "npatterns is not power of 2";
> > +  return false;
> > +}
> > +
> > +  /* We want to avoid cases where sel.length is not a multiple of 
> > npatterns.
> > + For eg: sel.length = 2 + 2x, and sel npatterns = 4.  */
> > +  poly_uint64 esel;
> > +  if (!multiple_p (sel.length (), sel_npatterns, ))
> > +{
> > +  if (reason)
> > + *reason = "sel.length is not multiple of sel_npatterns";
> > +  return false;
> > +}
> > +
> > +  if (sel_nelts_per_pattern < 3)
> > +return true;
> > +
> > +  for (unsigned pattern = 0; pattern < sel_npatterns; pattern++)
> > +{
> > +  poly_uint64 a1 = sel[pattern + sel_npatterns];
> > +  poly_uint64 a2 = sel[pattern + 2 * sel_npatterns];
> > +  HOST_WIDE_INT S;
>
> Trailing whitespace.  The convention is to use lowercase variable
> names, so please call this "step".
Fixed, thanks.
>
> > +  if (!poly_int64 (a2 - a1).is_constant ())
> > + {
> > +   if (reason)
> > + *reason = "step is not constant";
> > +   return false;
> > + }
> > +  // FIXME: Punt on S < 0 for now, revisit later.
> > +  if (S < 0)
> > + return false;
> > +  if (S == 0)
> > + continue;
> > +
> > +  if (!pow2p_hwi (S))
> > + {
> > +   if (reason)
> > + *reason = "step is not power of 2";
> > +   return false;
> > + }
> > +
> > +  /*

Re: [PATCH] Add -Wdisabled-optimization warning for not optimizing sibling calls

2023-08-05 Thread Prathamesh Kulkarni via Gcc-patches
On Sun, 6 Aug 2023 at 03:07, Bradley Lucier  wrote:
>
> On 8/5/23 4:58 PM, Prathamesh Kulkarni wrote:
> > I don't have comments on the patch, but a new warning will also
> > require a corresponding entry in doc/invoke.texi.
>
> Thank you for your comment.
>
> -Wdisabled-optimization is an established warning, it's just that I'd
> like it to apply in another circumstance.  Maybe that doesn't need new
> documentation.
Oops I misread your patch as adding a new warning :/
Sorry for the noise.

Best Regards,
Prathamesh
>
> Brad Lucier


Re: [PATCH] Add -Wdisabled-optimization warning for not optimizing sibling calls

2023-08-05 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 4 Aug 2023 at 23:28, Bradley Lucier via Gcc-patches
 wrote:
>
> The patch at the end adds a warning when a tail/sibling call cannot be
> optimized for various reasons.
>
> I built and tested GCC with and without the patch with configuration
>
> Configured with: ../../gcc-mainline/configure --enable-languages=c
> --disable-multilib --prefix=/pkgs/gcc-mainline --disable-werror
>
> There were some changes in the test results, but I can't say that they
> look substantive:
>
> diff -C 2 summary.log ../gcc-mainline
> *** summary.log Thu Aug  3 22:56:13 2023
> --- ../gcc-mainline/summary.log Thu Aug  3 19:42:33 2023
> ***
> *** 14,22 
> === g++ Summary ===
>
> ! # of expected passes  239234
># of unexpected failures 5
># of expected failures   2087
> ! # of unsupported tests10566
> ! /home/lucier/programs/gcc/objdirs/gcc-mainline-new/gcc/xg++  version
> 14.0.0 20230802 (experimental) (GCC)
>
> === gcc tests ===
> --- 14,22 
> === g++ Summary ===
>
> ! # of expected passes  239262
># of unexpected failures 5
># of expected failures   2087
> ! # of unsupported tests10562
> ! /home/lucier/programs/gcc/objdirs/gcc-mainline/gcc/xg++  version
> 14.0.0 20230802 (experimental) (GCC)
>
> === gcc tests ===
> ***
> *** 155,164 
> === gcc Summary ===
>
> ! # of expected passes  192553
># of unexpected failures 109
># of unexpected successes19
># of expected failures   1506
> ! # of unsupported tests2623
> ! /home/lucier/programs/gcc/objdirs/gcc-mainline-new/gcc/xgcc  version
> 14.0.0 20230802 (experimental) (GCC)
>
> === libatomic tests ===
> --- 155,164 
> === gcc Summary ===
>
> ! # of expected passes  192563
># of unexpected failures 109
># of unexpected successes19
># of expected failures   1506
> ! # of unsupported tests2619
> ! /home/lucier/programs/gcc/objdirs/gcc-mainline/gcc/xgcc  version
> 14.0.0 20230802 (experimental) (GCC)
>
> === libatomic tests ===
>
> I then configured and built GCC with
>
>   ../../gcc-mainline/configure CXX="/pkgs/gcc-mainline-new/bin/g++
> -Wdisabled-optimization" --enable-languages=c --disable-multilib
> --prefix=/pkgs/gcc-mainline-test --disable-werror --disable-bootstrap
>
> to test the new warning.  The warnings are of the form, e.g.,
>
> ../../../gcc-mainline/gcc/tree-vect-stmts.cc:11990:44: warning: cannot
> apply sibling-call optimization: callee required more stack slots than
> the caller [-Wdisabled-optimization]
>
> These are the number of times this warning was triggered building stage1:
>
> grep warning: build.log | grep sibling | sed 's/^.*://' | sort | uniq -c
>  259  callee required more stack slots than the caller
> [-Wdisabled-optimization]
>   43  callee returns a structure [-Wdisabled-optimization]
>
> If this patch is OK, someone else will need to commit it for me.
>
> Brad
>
> gcc/Changelog
>
> * calls.cc (maybe_complain_about_tail_call) Add warning when
> tail or sibling call cannot be optimized.
Hi Bradley,
I don't have comments on the patch, but a new warning will also
require a corresponding entry in doc/invoke.texi.

Thanks,
Prathamesh
>
> diff --git a/gcc/calls.cc b/gcc/calls.cc
> index 1f3a6d5c450..b95c876fda8 100644
> --- a/gcc/calls.cc
> +++ b/gcc/calls.cc
> @@ -1242,10 +1242,12 @@ void
>   maybe_complain_about_tail_call (tree call_expr, const char *reason)
>   {
> gcc_assert (TREE_CODE (call_expr) == CALL_EXPR);
> -  if (!CALL_EXPR_MUST_TAIL_CALL (call_expr))
> -return;
> -
> -  error_at (EXPR_LOCATION (call_expr), "cannot tail-call: %s", reason);
> +  if (CALL_EXPR_MUST_TAIL_CALL (call_expr))
> +error_at (EXPR_LOCATION (call_expr), "cannot tail-call: %s", reason);
> +  else if (flag_optimize_sibling_calls)
> +warning (OPT_Wdisabled_optimization,
> + "cannot apply sibling-call optimization: %s", reason);
> +  return;
>   }
>
>   /* Fill in ARGS_SIZE and ARGS array based on the parameters found in
>
>


Re: [PATCH] poly_int: Handle more can_div_trunc_p cases

2023-08-04 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 3 Aug 2023 at 18:15, Richard Sandiford
 wrote:
>
> can_div_trunc_p (a, b, &Q, &r) tries to compute a Q and r that
> satisfy the usual conditions for truncating division:
>
>  (1) a = b * Q + r
>  (2) |b * Q| <= |a|
>  (3) |r| < |b|
>
> We can compute Q using the constant component (the case when
> all indeterminates are zero).  Since |r| < |b| for the constant
> case, the requirements for indeterminate xi with coefficients
> ai (for a) and bi (for b) are:
>
>  (2') |bi * Q| <= |ai|
>  (3') |ai - bi * Q| <= |bi|
>
> (See the big comment for more details, restrictions, and reasoning).
>
> However, the function works on abstract arithmetic types, and so
> it has to be careful not to introduce new overflow.  The code
> therefore only handled the extreme for (3'), that is:
>
>  |ai - bi * Q| = |bi|
>
> for the case where Q is zero.
>
> Looking at it again, the overflow issue is a bit easier to handle than
> I'd originally thought (or so I hope).  This patch therefore extends the
> code to handle |ai - bi * Q| = |bi| for all Q, with Q = 0 no longer
> being a separate case.
>
> The net effect is to allow the function to succeed for things like:
>
>  (a0 + b1 (Q+1) x) / (b0 + b1 x)
>
> where Q = a0 / b0, with various sign conditions.  E.g. we now handle:
>
>  (7 + 8x) / (4 + 4x)
>
> with Q = 1 and r = 3 + 4x,
>
> Tested on aarch64-linux-gnu.  OK to install?
Hi Richard,
Thanks for the fix! With this patch, I can confirm we correctly select arg1,
when a pattern in sel has len = 4 + 4x, a1 = 5 + 4x and ae = 7 + 8x.

Thanks,
Prathamesh

>
> Richard
>
>
> gcc/
> * poly-int.h (can_div_trunc_p): Succeed for more boundary conditions.
>
> gcc/testsuite/
> * gcc.dg/plugin/poly-int-tests.h (test_can_div_trunc_p_const)
> (test_can_div_trunc_p_const): Add more tests.
> ---
>  gcc/poly-int.h   | 45 ++-
>  gcc/testsuite/gcc.dg/plugin/poly-int-tests.h | 85 +---
>  2 files changed, 98 insertions(+), 32 deletions(-)
>
> diff --git a/gcc/poly-int.h b/gcc/poly-int.h
> index 12571455081..7bff5e5ad26 100644
> --- a/gcc/poly-int.h
> +++ b/gcc/poly-int.h
> @@ -2355,28 +2355,31 @@ can_div_trunc_p (const poly_int_pod ,
> }
>else
> {
> - if (q == 0)
> -   {
> - /* For Q == 0 we simply need: (3') |ai| <= |bi|.  */
> - if (a.coeffs[i] != ICa (0))
> -   {
> - /* Use negative absolute to avoid overflow, i.e.
> --|ai| >= -|bi|.  */
> - C neg_abs_a = (a.coeffs[i] < 0 ? a.coeffs[i] : 
> -a.coeffs[i]);
> - C neg_abs_b = (b.coeffs[i] < 0 ? b.coeffs[i] : 
> -b.coeffs[i]);
> - if (neg_abs_a < neg_abs_b)
> -   return false;
> - rem_p = true;
> -   }
> -   }
> + /* The only unconditional arithmetic that we can do on ai,
> +bi and Q is ai / bi and ai % bi.  (ai == minimum int and
> +bi == -1 would be UB in the caller.)  Anything else runs
> +the risk of overflow.  */
> + auto qi = NCa (a.coeffs[i]) / NCb (b.coeffs[i]);
> + auto ri = NCa (a.coeffs[i]) % NCb (b.coeffs[i]);
> + /* (2') and (3') are satisfied when ai /[trunc] bi == q.
> +So is the stricter condition |ai - bi * Q| < |bi|.  */
> + if (qi == q)
> +   rem_p |= (ri != 0);
> + /* The only other case is when:
> +
> +|bi * Q| + |bi| = |ai| (for (2'))
> +and |ai - bi * Q|   = |bi| (for (3'))
> +
> +The first is equivalent to |bi|(|Q| + 1) == |ai|.
> +The second requires ai == bi * (Q + 1) or ai == bi * (Q - 1).  */
> + else if (ri != 0)
> +   return false;
> + else if (q <= 0 && qi < q && qi + 1 == q)
> +   ;
> + else if (q >= 0 && qi > q && qi - 1 == q)
> +   ;
>   else
> -   {
> - /* Otherwise just check for the case in which ai / bi == Q.  */
> - if (NCa (a.coeffs[i]) / NCb (b.coeffs[i]) != q)
> -   return false;
> - if (NCa (a.coeffs[i]) % NCb (b.coeffs[i]) != 0)
> -   rem_p = true;
> -   }
> +   return false;
> }
>  }
>
> diff --git a/gcc/testsuite/gcc.dg/plugin/poly-int-tests.h 
> b/gcc/testsuite/gcc.dg/plugin/poly-int-tests.h
> index 0b89acd91cd..7af98595a5e 100644
> --- a/gcc/testsuite/gcc.dg/plugin/poly-int-tests.h
> +++ b/gcc/testsuite/gcc.dg/plugin/poly-int-tests.h
> @@ -1899,14 +1899,19 @@ test_can_div_trunc_p_const ()
> ph::make (4, 8, 12),
> _quot));
>ASSERT_EQ (const_quot, C (2));
> -  ASSERT_EQ (can_div_trunc_p (ph::make (15, 25, 40),
> +  ASSERT_TRUE (can_div_trunc_p (ph::make (15, 25, 40),
> +   ph::make (4, 8, 10),
> +   _quot));
> +  

Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-08-04 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 3 Aug 2023 at 18:46, Richard Sandiford
 wrote:
>
> Richard Sandiford  writes:
> > Prathamesh Kulkarni  writes:
> >> On Tue, 25 Jul 2023 at 18:25, Richard Sandiford
> >>  wrote:
> >>>
> >>> Hi,
> >>>
> >>> Thanks for the rework and sorry for the slow review.
> >> Hi Richard,
> >> Thanks for the suggestions!  Please find my responses inline below.
> >>>
> >>> Prathamesh Kulkarni  writes:
> >>> > Hi Richard,
> >>> > This is reworking of patch to extend fold_vec_perm to handle VLA 
> >>> > vectors.
> >>> > The attached patch unifies handling of VLS and VLA vector_csts, while
> >>> > using fallback code
> >>> > for ctors.
> >>> >
> >>> > For VLS vector, the patch ignores underlying encoding, and
> >>> > uses npatterns = nelts, and nelts_per_pattern = 1.
> >>> >
> >>> > For VLA patterns, if sel has a stepped sequence, then it
> >>> > only chooses elements from a particular pattern of a particular
> >>> > input vector.
> >>> >
> >>> > To make things simpler, the patch imposes following constraints:
> >>> > (a) op0_npatterns, op1_npatterns and sel_npatterns are powers of 2.
> >>> > (b) The step size for a stepped sequence is a power of 2, and
> >>> >   multiple of npatterns of chosen input vector.
> >>> > (c) Runtime vector length of sel is a multiple of sel_npatterns.
> >>> >  So, we don't handle sel.length = 2 + 2x and npatterns = 4.
> >>> >
> >>> > Eg:
> >>> > op0, op1: npatterns = 2, nelts_per_pattern = 3
> >>> > op0_len = op1_len = 16 + 16x.
> >>> > sel = { 0, 0, 2, 0, 4, 0, ... }
> >>> > npatterns = 2, nelts_per_pattern = 3.
> >>> >
> >>> > For pattern {0, 2, 4, ...}
> >>> > Let,
> >>> > a1 = 2
> >>> > S = step size = 2
> >>> >
> >>> > Let Esel denote number of elements per pattern in sel at runtime.
> >>> > Esel = (16 + 16x) / npatterns_sel
> >>> > = (16 + 16x) / 2
> >>> > = (8 + 8x)
> >>> >
> >>> > So, last element of pattern:
> >>> > ae = a1 + (Esel - 2) * S
> >>> >  = 2 + (8 + 8x - 2) * 2
> >>> >  = 14 + 16x
> >>> >
> >>> > a1 /trunc arg0_len = 2 / (16 + 16x) = 0
> >>> > ae /trunc arg0_len = (14 + 16x) / (16 + 16x) = 0
> >>> > Since both are equal with quotient = 0, we select elements from op0.
> >>> >
> >>> > Since step size (S) is a multiple of npatterns(op0), we select
> >>> > all elements from same pattern of op0.
> >>> >
> >>> > res_npatterns = max (op0_npatterns, max (op1_npatterns, sel_npatterns))
> >>> >= max (2, max (2, 2)
> >>> >= 2
> >>> >
> >>> > res_nelts_per_pattern = max (op0_nelts_per_pattern,
> >>> > max 
> >>> > (op1_nelts_per_pattern,
> >>> >  
> >>> > sel_nelts_per_pattern))
> >>> > = max (3, max (3, 3))
> >>> > = 3
> >>> >
> >>> > So res has encoding with npatterns = 2, nelts_per_pattern = 3.
> >>> > res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... }
> >>> >
> >>> > Unfortunately, this results in an issue for poly_int_cst index:
> >>> > For example,
> >>> > op0, op1: npatterns = 1, nelts_per_pattern = 3
> >>> > op0_len = op1_len = 4 + 4x
> >>> >
> >>> > sel: { 4 + 4x, 5 + 4x, 6 + 4x, ... } // should choose op1
> >>> >
> >>> > In this case,
> >>> > a1 = 5 + 4x
> >>> > S = (6 + 4x) - (5 + 4x) = 1
> >>> > Esel = 4 + 4x
> >>> >
> >>> > ae = a1 + (esel - 2) * S
> >>> >  = (5 + 4x) + (4 + 4x - 2) * 1
> >>> >  = 7 + 8x
> >>> >
> >>> > IIUC, 7 + 8x will always be index for last element of op1 ?
> >>> > if x = 0, len = 4, 7 + 8x = 7
> >>> > if x = 1, len = 8, 7 + 8x = 15, etc.

Re: [PATCH][RFC] tree-optimization/92335 - Improve sinking heuristics for vectorization

2023-08-03 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 2 Aug 2023 at 14:17, Richard Biener via Gcc-patches
 wrote:
>
> On Mon, 31 Jul 2023, Jeff Law wrote:
>
> >
> >
> > On 7/28/23 01:05, Richard Biener via Gcc-patches wrote:
> > > The following delays sinking of loads within the same innermost
> > > loop when it was unconditional before.  That's a not uncommon
> > > issue preventing vectorization when masked loads are not available.
> > >
> > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > >
> > > I have a followup patch improving sinking that without this would
> > > cause more of the problematic sinking - now that we have a second
> > > sink pass after loop opts this looks like a reasonable approach?
> > >
> > > OK?
> > >
> > > Thanks,
> > > Richard.
> > >
> > >  PR tree-optimization/92335
> > >  * tree-ssa-sink.cc (select_best_block): Before loop
> > >  optimizations avoid sinking unconditional loads/stores
> > >  in innermost loops to conditional executed places.
> > >
> > >  * gcc.dg/tree-ssa/ssa-sink-10.c: Disable vectorizing.
> > >  * gcc.dg/tree-ssa/predcom-9.c: Clone from ssa-sink-10.c,
> > >  expect predictive commoning to happen instead of sinking.
> > >  * gcc.dg/vect/pr65947-3.c: Adjust.
> > I think it's reasonable -- there's probably going to be cases where it's not
> > great, but more often than not I think it's going to be a reasonable
> > heuristic.
> >
> > If there is undesirable fallout, better to find it over the coming months 
> > than
> > next spring.  So I'd suggest we go forward now to give more time to find any
> > pathological cases (if they exist).
>
> Agreed, I've pushed this now.
Hi Richard,
After this patch (committed in 399c8dd44ff44f4b496223c7cc980651c4d6f6a0),
pr65947-7.c "failed" for aarch64-linux-gnu:
FAIL: gcc.dg/vect/pr65947-7.c scan-tree-dump-not vect "LOOP VECTORIZED"
FAIL: gcc.dg/vect/pr65947-7.c -flto -ffat-lto-objects
scan-tree-dump-not vect "LOOP VECTORIZED"

/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { target {
! vect_fold_extract_last } } } } */

With your commit, condition_reduction in pr65947-7.c gets vectorized
regardless of vect_fold_extract_last,
which gates the above test (which is an improvement, because the
function didn't get vectorized before the commit).

The attached patch thus removes the gating on vect_fold_extract_last,
and the test passes again.
OK to commit ?

Thanks,
Prathamesh
>
> Richard.
diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-7.c 
b/gcc/testsuite/gcc.dg/vect/pr65947-7.c
index 16cdcd1c6eb..7dabae81abf 100644
--- a/gcc/testsuite/gcc.dg/vect/pr65947-7.c
+++ b/gcc/testsuite/gcc.dg/vect/pr65947-7.c
@@ -52,5 +52,4 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target 
vect_fold_extract_last } } } */
-/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { target { ! 
vect_fold_extract_last } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */


Re: [COMMITTEDv3] tree-optimization: [PR100864] `(a&!b) | b` is not opimized to `a | b` for comparisons

2023-08-03 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 3 Aug 2023 at 02:54, Andrew Pinski  wrote:
>
> On Wed, Aug 2, 2023 at 10:14 AM Andrew Pinski  wrote:
> >
> > On Wed, Aug 2, 2023 at 10:13 AM Prathamesh Kulkarni via Gcc-patches
> >  wrote:
> > >
> > > On Mon, 31 Jul 2023 at 22:39, Andrew Pinski via Gcc-patches
> > >  wrote:
> > > >
> > > > This is a new version of the patch.
> > > > Instead of doing the matching of inversion comparison directly inside
> > > > match, creating a new function (bitwise_inverted_equal_p) to do it.
> > > > It is very similar to bitwise_equal_p that was added in 
> > > > r14-2751-g2a3556376c69a1fb
> > > > but instead it says `expr1 == ~expr2`. A follow on patch, will
> > > > use this function in other patterns where we try to match `@0` and 
> > > > `(bit_not @0)`.
> > > >
> > > > Changed the name bitwise_not_equal_p to bitwise_inverted_equal_p.
> > > >
> > > > Committed as approved after a Bootstrapped and test on x86_64-linux-gnu 
> > > > with no regressions.
> > > Hi Andrew,
> > > Unfortunately, this patch (committed in
> > > 2bae476b511dc441bf61da8a49cca655575e7dd6) causes
> > > segmentation fault for pr33133.c on aarch64-linux-gnu because of
> > > infinite recursion.
> >
> > A similar issue is recorded as PR 110874 which I am debugging right now.
>
> Yes the issue is the same and is solved by the same patch.
That's great, thanks for the heads up!

Thanks,
Prathamesh
>
> Thanks,
> Andrew
>
> >
> > Thanks,
> > Andrew
> >
> > >
> > > Running the test under gdb shows:
> > > Program received signal SIGSEGV, Segmentation fault.
> > > operand_compare::operand_equal_p (this=0x29dc680
> > > , arg0=0xf7789a68, arg1=0xf7789f30,
> > > flags=16) at ../../gcc/gcc/fold-const.cc:3088
> > > 3088{
> > > (gdb) bt
> > > #0  operand_compare::operand_equal_p (this=0x29dc680
> > > , arg0=0xf7789a68, arg1=0xf7789f30,
> > > flags=16) at ../../gcc/gcc/fold-const.cc:3088
> > > #1  0x00a90394 in operand_compare::verify_hash_value
> > > (this=this@entry=0x29dc680 ,
> > > arg0=arg0@entry=0xf7789a68, arg1=arg1@entry=0xf7789f30,
> > > flags=flags@entry=0, ret=ret@entry=0xfc000157)
> > > at ../../gcc/gcc/fold-const.cc:4074
> > > #2  0x00a9351c in operand_compare::verify_hash_value
> > > (ret=0xfc000157, flags=0, arg1=0xf7789f30,
> > > arg0=0xf7789a68, this=0x29dc680 ) at
> > > ../../gcc/gcc/fold-const.cc:4072
> > > #3  operand_compare::operand_equal_p (this=this@entry=0x29dc680
> > > , arg0=arg0@entry=0xf7789a68,
> > > arg1=arg1@entry=0xf7789f30, flags=flags@entry=0) at
> > > ../../gcc/gcc/fold-const.cc:3090
> > > #4  0x00a9791c in operand_equal_p
> > > (arg0=arg0@entry=0xf7789a68, arg1=arg1@entry=0xf7789f30,
> > > flags=flags@entry=0) at ../../gcc/gcc/fold-const.cc:4105
> > > #5  0x01d38dd0 in gimple_bitwise_inverted_equal_p
> > > (expr1=0xf7789a68, expr2=0xf7789f30, valueize=
> > > 0x112d698 ) at
> > > ../../gcc/gcc/gimple-match-head.cc:284
> > > #6  0x01d38e80 in gimple_bitwise_inverted_equal_p
> > > (expr1=0xf7789a68, expr2=0xf77d0240,
> > > valueize=0x112d698 ) at
> > > ../../gcc/gcc/gimple-match-head.cc:296
> > > #7  0x01d38e80 in gimple_bitwise_inverted_equal_p
> > > (expr1=0xf7789a68, expr2=0xf7789f30,
> > > valueize=0x112d698 ) at
> > > ../../gcc/gcc/gimple-match-head.cc:296
> > > #8  0x01d38e80 in gimple_bitwise_inverted_equal_p
> > > (expr1=0xf7789a68, expr2=0xf77d0240,
> > > ...
> > >
> > > It seems to recurse cyclically with expr2=0xf7789f30 ->
> > > expr2=0xf77d0240 eventually leading to segfault.
> > > while expr1=0xf7789a68 remains same throughout the stack frames.
> > >
> > > Thanks,
> > > Prathamesh
> > > >
> > > > PR tree-optimization/100864
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * generic-match-head.cc (bitwise_inverted_equal_p): New 
> > > > function.
> > > > * gimple-match-head.cc (bitwise_inverted_equal_p): New macro.
> > > > (gimple_bitwise_inverted_equal_p): New function.
> > > > * match.pd ((~x | y) & x): Use 

Re: [COMMITTEDv3] tree-optimization: [PR100864] `(a&!b) | b` is not opimized to `a | b` for comparisons

2023-08-02 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 31 Jul 2023 at 22:39, Andrew Pinski via Gcc-patches
 wrote:
>
> This is a new version of the patch.
> Instead of doing the matching of inversion comparison directly inside
> match, creating a new function (bitwise_inverted_equal_p) to do it.
> It is very similar to bitwise_equal_p that was added in 
> r14-2751-g2a3556376c69a1fb
> but instead it says `expr1 == ~expr2`. A follow on patch, will
> use this function in other patterns where we try to match `@0` and `(bit_not 
> @0)`.
>
> Changed the name bitwise_not_equal_p to bitwise_inverted_equal_p.
>
> Committed as approved after a Bootstrapped and test on x86_64-linux-gnu with 
> no regressions.
Hi Andrew,
Unfortunately, this patch (committed in
2bae476b511dc441bf61da8a49cca655575e7dd6) causes
segmentation fault for pr33133.c on aarch64-linux-gnu because of
infinite recursion.

Running the test under gdb shows:
Program received signal SIGSEGV, Segmentation fault.
operand_compare::operand_equal_p (this=0x29dc680
, arg0=0xf7789a68, arg1=0xf7789f30,
flags=16) at ../../gcc/gcc/fold-const.cc:3088
3088{
(gdb) bt
#0  operand_compare::operand_equal_p (this=0x29dc680
, arg0=0xf7789a68, arg1=0xf7789f30,
flags=16) at ../../gcc/gcc/fold-const.cc:3088
#1  0x00a90394 in operand_compare::verify_hash_value
(this=this@entry=0x29dc680 ,
arg0=arg0@entry=0xf7789a68, arg1=arg1@entry=0xf7789f30,
flags=flags@entry=0, ret=ret@entry=0xfc000157)
at ../../gcc/gcc/fold-const.cc:4074
#2  0x00a9351c in operand_compare::verify_hash_value
(ret=0xfc000157, flags=0, arg1=0xf7789f30,
arg0=0xf7789a68, this=0x29dc680 ) at
../../gcc/gcc/fold-const.cc:4072
#3  operand_compare::operand_equal_p (this=this@entry=0x29dc680
, arg0=arg0@entry=0xf7789a68,
arg1=arg1@entry=0xf7789f30, flags=flags@entry=0) at
../../gcc/gcc/fold-const.cc:3090
#4  0x00a9791c in operand_equal_p
(arg0=arg0@entry=0xf7789a68, arg1=arg1@entry=0xf7789f30,
flags=flags@entry=0) at ../../gcc/gcc/fold-const.cc:4105
#5  0x01d38dd0 in gimple_bitwise_inverted_equal_p
(expr1=0xf7789a68, expr2=0xf7789f30, valueize=
0x112d698 ) at
../../gcc/gcc/gimple-match-head.cc:284
#6  0x01d38e80 in gimple_bitwise_inverted_equal_p
(expr1=0xf7789a68, expr2=0xf77d0240,
valueize=0x112d698 ) at
../../gcc/gcc/gimple-match-head.cc:296
#7  0x01d38e80 in gimple_bitwise_inverted_equal_p
(expr1=0xf7789a68, expr2=0xf7789f30,
valueize=0x112d698 ) at
../../gcc/gcc/gimple-match-head.cc:296
#8  0x01d38e80 in gimple_bitwise_inverted_equal_p
(expr1=0xf7789a68, expr2=0xf77d0240,
...

It seems to recurse cyclically with expr2=0xf7789f30 ->
expr2=0xf77d0240 eventually leading to segfault.
while expr1=0xf7789a68 remains same throughout the stack frames.

Thanks,
Prathamesh
>
> PR tree-optimization/100864
>
> gcc/ChangeLog:
>
> * generic-match-head.cc (bitwise_inverted_equal_p): New function.
> * gimple-match-head.cc (bitwise_inverted_equal_p): New macro.
> (gimple_bitwise_inverted_equal_p): New function.
> * match.pd ((~x | y) & x): Use bitwise_inverted_equal_p
> instead of direct matching bit_not.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/bitops-3.c: New test.
> ---
>  gcc/generic-match-head.cc| 42 ++
>  gcc/gimple-match-head.cc | 71 
>  gcc/match.pd |  5 +-
>  gcc/testsuite/gcc.dg/tree-ssa/bitops-3.c | 67 ++
>  4 files changed, 183 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/bitops-3.c
>
> diff --git a/gcc/generic-match-head.cc b/gcc/generic-match-head.cc
> index a71c0727b0b..ddaf22f2179 100644
> --- a/gcc/generic-match-head.cc
> +++ b/gcc/generic-match-head.cc
> @@ -121,3 +121,45 @@ bitwise_equal_p (tree expr1, tree expr2)
>  return wi::to_wide (expr1) == wi::to_wide (expr2);
>return operand_equal_p (expr1, expr2, 0);
>  }
> +
> +/* Return true if EXPR1 and EXPR2 have the bitwise opposite value,
> +   but not necessarily same type.
> +   The types can differ through nop conversions.  */
> +
> +static inline bool
> +bitwise_inverted_equal_p (tree expr1, tree expr2)
> +{
> +  STRIP_NOPS (expr1);
> +  STRIP_NOPS (expr2);
> +  if (expr1 == expr2)
> +return false;
> +  if (!tree_nop_conversion_p (TREE_TYPE (expr1), TREE_TYPE (expr2)))
> +return false;
> +  if (TREE_CODE (expr1) == INTEGER_CST && TREE_CODE (expr2) == INTEGER_CST)
> +return wi::to_wide (expr1) == ~wi::to_wide (expr2);
> +  if (operand_equal_p (expr1, expr2, 0))
> +return false;
> +  if (TREE_CODE (expr1) == BIT_NOT_EXPR
> +  && bitwise_equal_p (TREE_OPERAND (expr1, 0), expr2))
> +return true;
> +  if (TREE_CODE (expr2) == BIT_NOT_EXPR
> +  && bitwise_equal_p (expr1, TREE_OPERAND (expr2, 0)))
> +return true;
> +  if (COMPARISON_CLASS_P (expr1)
> +  

Re: [PATCH v2] combine: Narrow comparison of memory and constant

2023-08-01 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 1 Aug 2023 at 05:20, Jeff Law  wrote:
>
>
>
> On 7/31/23 15:43, Prathamesh Kulkarni via Gcc-patches wrote:
> > On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via
> > Gcc-patches  wrote:
> >>
> >> Comparisons between memory and constants might be done in a smaller mode
> >> resulting in smaller constants which might finally end up as immediates
> >> instead of in the literal pool.
> >>
> >> For example, on s390x a non-symmetric comparison like
> >>x <= 0x3fff
> >> results in the constant being spilled to the literal pool and an 8 byte
> >> memory comparison is emitted.  Ideally, an equivalent comparison
> >>x0 <= 0x3f
> >> where x0 is the most significant byte of x, is emitted where the
> >> constant is smaller and more likely to materialize as an immediate.
> >>
> >> Similarly, comparisons of the form
> >>x >= 0x4000
> >> can be shortened into x0 >= 0x40.
> >>
> >> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> >> Note, the new tests show that for the mentioned little-endian targets
> >> the optimization does not materialize since either the costs of the new
> >> instructions are higher or they do not match.  Still ok for mainline?
> > Hi Stefan,
> > Unfortunately this patch (committed in 
> > 7cdd0860949c6c3232e6cff1d7ca37bb5234074c)
> > caused the following ICE on armv8l-unknown-linux-gnu:
> > during RTL pass: combine
> > ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’:
> > ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in
> > decompose, at rtl.h:2297
> >210 | }
> >| ^
> > 0xaa23e3 wi::int_traits
> >> ::decompose(long long*, unsigned int, std::pair > machine_mode> const&)
> >  ../../gcc/gcc/rtl.h:2297
> [ ... ]
> Yea, we're seeing something very similar on nios2-linux-gnu building the
> kernel.
>
> Prathamesh, can you extract the .i file for fixed-bit on armv8 and open
> a bug for this issue, attaching the .i file as well as the right command
> line options necessary to reproduce the failure.  THat way Stefan can
> tackle it with a cross compiler.
Hi Jeff,
Filed the issue in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110867

Thanks,
Prathamesh
>
> Thanks,
> jeff


Re: [PATCH v2] combine: Narrow comparison of memory and constant

2023-07-31 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 1 Aug 2023 at 03:13, Prathamesh Kulkarni
 wrote:
>
> On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via
> Gcc-patches  wrote:
> >
> > Comparisons between memory and constants might be done in a smaller mode
> > resulting in smaller constants which might finally end up as immediates
> > instead of in the literal pool.
> >
> > For example, on s390x a non-symmetric comparison like
> >   x <= 0x3fff
> > results in the constant being spilled to the literal pool and an 8 byte
> > memory comparison is emitted.  Ideally, an equivalent comparison
> >   x0 <= 0x3f
> > where x0 is the most significant byte of x, is emitted where the
> > constant is smaller and more likely to materialize as an immediate.
> >
> > Similarly, comparisons of the form
> >   x >= 0x4000
> > can be shortened into x0 >= 0x40.
> >
> > Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> > Note, the new tests show that for the mentioned little-endian targets
> > the optimization does not materialize since either the costs of the new
> > instructions are higher or they do not match.  Still ok for mainline?
> Hi Stefan,
> Unfortunately this patch (committed in 
> 7cdd0860949c6c3232e6cff1d7ca37bb5234074c)
> caused the following ICE on armv8l-unknown-linux-gnu:
Sorry I meant armv8l-unknown-linux-gnueabihf.
> during RTL pass: combine
> ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’:
> ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in
> decompose, at rtl.h:2297
>   210 | }
>   | ^
> 0xaa23e3 wi::int_traits
> >::decompose(long long*, unsigned int, std::pair machine_mode> const&)
> ../../gcc/gcc/rtl.h:2297
> 0xaf5ab3 wide_int_ref_storage true>::wide_int_ref_storage
> >(std::pair const&)
> ../../gcc/gcc/wide-int.h:1030
> 0xaf5023 generic_wide_int
> >::generic_wide_int
> >(std::pair const&)
> ../../gcc/gcc/wide-int.h:788
> 0xf916f9 simplify_const_unary_operation(rtx_code, machine_mode,
> rtx_def*, machine_mode)
> ../../gcc/gcc/simplify-rtx.cc:2131
> 0xf8bad5 simplify_context::simplify_unary_operation(rtx_code,
> machine_mode, rtx_def*, machine_mode)
> ../../gcc/gcc/simplify-rtx.cc:889
> 0xf8a591 simplify_context::simplify_gen_unary(rtx_code, machine_mode,
> rtx_def*, machine_mode)
> ../../gcc/gcc/simplify-rtx.cc:360
> 0x9bd1b7 simplify_gen_unary(rtx_code, machine_mode, rtx_def*, machine_mode)
> ../../gcc/gcc/rtl.h:3520
> 0x1bd5677 simplify_comparison
> ../../gcc/gcc/combine.cc:13125
> 0x1bc2b2b simplify_set
> ../../gcc/gcc/combine.cc:6848
> 0x1bc1647 combine_simplify_rtx
> ../../gcc/gcc/combine.cc:6353
> 0x1bbf97f subst
> ../../gcc/gcc/combine.cc:5609
> 0x1bb864b try_combine
> ../../gcc/gcc/combine.cc:3302
> 0x1bb30fb combine_instructions
> ../../gcc/gcc/combine.cc:1264
> 0x1bd8d25 rest_of_handle_combine
> ../../gcc/gcc/combine.cc:15059
> 0x1bd8dd5 execute
> ../../gcc/gcc/combine.cc:15103
> Please submit a full bug report, with preprocessed source (by using
> -freport-bug).
> Please include the complete backtrace with any bug report.
> See <https://gcc.gnu.org/bugs/> for instructions.
>
> Could you please take a look ?
>
> Thanks,
> Prathamesh
> >
> > gcc/ChangeLog:
> >
> > * combine.cc (simplify_compare_const): Narrow comparison of
> > memory and constant.
> > (try_combine): Adapt new function signature.
> > (simplify_comparison): Adapt new function signature.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.dg/cmp-mem-const-1.c: New test.
> > * gcc.dg/cmp-mem-const-2.c: New test.
> > * gcc.dg/cmp-mem-const-3.c: New test.
> > * gcc.dg/cmp-mem-const-4.c: New test.
> > * gcc.dg/cmp-mem-const-5.c: New test.
> > * gcc.dg/cmp-mem-const-6.c: New test.
> > * gcc.target/s390/cmp-mem-const-1.c: New test.
> > ---
> >  gcc/combine.cc| 79 +--
> >  gcc/testsuite/gcc.dg/cmp-mem-const-1.c| 17 
> >  gcc/testsuite/gcc.dg/cmp-mem-const-2.c| 17 
> >  gcc/testsuite/gcc.dg/cmp-mem-const-3.c| 17 
> >  gcc/testsuite/gcc.dg/cmp-mem-const-4.c| 17 
> >  gcc/testsuite/gcc.dg/cmp-mem-const-5.c| 17 
> >  gcc/testsuite/gcc.dg/cmp-mem-const-6.c| 17 
> >  .../gcc.target/s390/cmp-mem-const-1.c | 24 ++
> >  8 files changed, 200 insertions(+), 5 de

Re: [PATCH v2] combine: Narrow comparison of memory and constant

2023-07-31 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via
Gcc-patches  wrote:
>
> Comparisons between memory and constants might be done in a smaller mode
> resulting in smaller constants which might finally end up as immediates
> instead of in the literal pool.
>
> For example, on s390x a non-symmetric comparison like
>   x <= 0x3fff
> results in the constant being spilled to the literal pool and an 8 byte
> memory comparison is emitted.  Ideally, an equivalent comparison
>   x0 <= 0x3f
> where x0 is the most significant byte of x, is emitted where the
> constant is smaller and more likely to materialize as an immediate.
>
> Similarly, comparisons of the form
>   x >= 0x4000
> can be shortened into x0 >= 0x40.
>
> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> Note, the new tests show that for the mentioned little-endian targets
> the optimization does not materialize since either the costs of the new
> instructions are higher or they do not match.  Still ok for mainline?
Hi Stefan,
Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c)
caused the following ICE on armv8l-unknown-linux-gnu:
during RTL pass: combine
../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’:
../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in
decompose, at rtl.h:2297
  210 | }
  | ^
0xaa23e3 wi::int_traits
>::decompose(long long*, unsigned int, std::pair const&)
../../gcc/gcc/rtl.h:2297
0xaf5ab3 wide_int_ref_storage::wide_int_ref_storage
>(std::pair const&)
../../gcc/gcc/wide-int.h:1030
0xaf5023 generic_wide_int
>::generic_wide_int
>(std::pair const&)
../../gcc/gcc/wide-int.h:788
0xf916f9 simplify_const_unary_operation(rtx_code, machine_mode,
rtx_def*, machine_mode)
../../gcc/gcc/simplify-rtx.cc:2131
0xf8bad5 simplify_context::simplify_unary_operation(rtx_code,
machine_mode, rtx_def*, machine_mode)
../../gcc/gcc/simplify-rtx.cc:889
0xf8a591 simplify_context::simplify_gen_unary(rtx_code, machine_mode,
rtx_def*, machine_mode)
../../gcc/gcc/simplify-rtx.cc:360
0x9bd1b7 simplify_gen_unary(rtx_code, machine_mode, rtx_def*, machine_mode)
../../gcc/gcc/rtl.h:3520
0x1bd5677 simplify_comparison
../../gcc/gcc/combine.cc:13125
0x1bc2b2b simplify_set
../../gcc/gcc/combine.cc:6848
0x1bc1647 combine_simplify_rtx
../../gcc/gcc/combine.cc:6353
0x1bbf97f subst
../../gcc/gcc/combine.cc:5609
0x1bb864b try_combine
../../gcc/gcc/combine.cc:3302
0x1bb30fb combine_instructions
../../gcc/gcc/combine.cc:1264
0x1bd8d25 rest_of_handle_combine
../../gcc/gcc/combine.cc:15059
0x1bd8dd5 execute
../../gcc/gcc/combine.cc:15103
Please submit a full bug report, with preprocessed source (by using
-freport-bug).
Please include the complete backtrace with any bug report.
See <https://gcc.gnu.org/bugs/> for instructions.

Could you please take a look ?

Thanks,
Prathamesh
>
> gcc/ChangeLog:
>
> * combine.cc (simplify_compare_const): Narrow comparison of
> memory and constant.
> (try_combine): Adapt new function signature.
> (simplify_comparison): Adapt new function signature.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/cmp-mem-const-1.c: New test.
> * gcc.dg/cmp-mem-const-2.c: New test.
> * gcc.dg/cmp-mem-const-3.c: New test.
> * gcc.dg/cmp-mem-const-4.c: New test.
> * gcc.dg/cmp-mem-const-5.c: New test.
> * gcc.dg/cmp-mem-const-6.c: New test.
> * gcc.target/s390/cmp-mem-const-1.c: New test.
> ---
>  gcc/combine.cc| 79 +--
>  gcc/testsuite/gcc.dg/cmp-mem-const-1.c| 17 
>  gcc/testsuite/gcc.dg/cmp-mem-const-2.c| 17 
>  gcc/testsuite/gcc.dg/cmp-mem-const-3.c| 17 
>  gcc/testsuite/gcc.dg/cmp-mem-const-4.c| 17 
>  gcc/testsuite/gcc.dg/cmp-mem-const-5.c| 17 
>  gcc/testsuite/gcc.dg/cmp-mem-const-6.c| 17 
>  .../gcc.target/s390/cmp-mem-const-1.c | 24 ++
>  8 files changed, 200 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-4.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-5.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-6.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
>
> diff --git a/gcc/combine.cc b/gcc/combine.cc
> index 5aa0ec5c45a..56e15a93409 100644
> --- a/gcc/combine.cc
> +++ b/gcc/combine.cc
> @@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, 
> machine_mode, rtx,
>  static int recog_for_combine (rtx *, rtx_insn *, rtx *);
>  static rtx gen_lowpart_for_combine (machine_mode, rtx);
>  static enum rtx_code simplify_compare_const (enum 

Re: [C PATCH]: Add Walloc-type to warn about insufficient size in allocations

2023-07-31 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 21 Jul 2023 at 16:52, Martin Uecker via Gcc-patches
 wrote:
>
>
>
> This patch adds a warning for allocations with insufficient size
> based on the "alloc_size" attribute and the type of the pointer
> the result is assigned to. While it is theoretically legal to
> assign to the wrong pointer type and cast it to the right type
> later, this almost always indicates an error. Since this catches
> common mistakes and is simple to diagnose, it is suggested to
> add this warning.
>
>
> Bootstrapped and regression tested on x86.
>
>
> Martin
>
>
>
> Add option Walloc-type that warns about allocations that have
> insufficient storage for the target type of the pointer the
> storage is assigned to.
>
> gcc:
> * doc/invoke.texi: Document -Walloc-type option.
>
> gcc/c-family:
>
> * c.opt (Walloc-type): New option.
>
> gcc/c:
> * c-typeck.cc (convert_for_assignment): Add Walloc-type warning.
>
> gcc/testsuite:
>
> * gcc.dg/Walloc-type-1.c: New test.
>
>
> diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
> index 4abdc8d0e77..8b9d148582b 100644
> --- a/gcc/c-family/c.opt
> +++ b/gcc/c-family/c.opt
> @@ -319,6 +319,10 @@ Walloca
>  C ObjC C++ ObjC++ Var(warn_alloca) Warning
>  Warn on any use of alloca.
>
> +Walloc-type
> +C ObjC Var(warn_alloc_type) Warning
> +Warn when allocating insufficient storage for the target type of the
> assigned pointer.
> +
>  Walloc-size-larger-than=
>  C ObjC C++ LTO ObjC++ Var(warn_alloc_size_limit) Joined Host_Wide_Int
> ByteSize Warning Init(HOST_WIDE_INT_MAX)
>  -Walloc-size-larger-than=   Warn for calls to allocation
> functions that
> diff --git a/gcc/c/c-typeck.cc b/gcc/c/c-typeck.cc
> index 7cf411155c6..2e392f9c952 100644
> --- a/gcc/c/c-typeck.cc
> +++ b/gcc/c/c-typeck.cc
> @@ -7343,6 +7343,32 @@ convert_for_assignment (location_t location,
> location_t expr_loc, tree type,
> "request for implicit conversion "
> "from %qT to %qT not permitted in C++", rhstype,
> type);
>
> +  /* Warn if new allocations are not big enough for the target
> type.  */
> +  tree fndecl;
> +  if (warn_alloc_type
> + && TREE_CODE (rhs) == CALL_EXPR
> + && (fndecl = get_callee_fndecl (rhs)) != NULL_TREE
> + && DECL_IS_MALLOC (fndecl))
> +   {
> + tree fntype = TREE_TYPE (fndecl);
> + tree fntypeattrs = TYPE_ATTRIBUTES (fntype);
> + tree alloc_size = lookup_attribute ("alloc_size",
> fntypeattrs);
> + if (alloc_size)
> +   {
> + tree args = TREE_VALUE (alloc_size);
> + int idx = TREE_INT_CST_LOW (TREE_VALUE (args)) - 1;
> + /* For calloc only use the second argument.  */
> + if (TREE_CHAIN (args))
> +   idx = TREE_INT_CST_LOW (TREE_VALUE (TREE_CHAIN
> (args))) - 1;
> + tree arg = CALL_EXPR_ARG (rhs, idx);
> + if (TREE_CODE (arg) == INTEGER_CST
> + && tree_int_cst_lt (arg, TYPE_SIZE_UNIT (ttl)))
Hi Martin,
Just wondering if it'd be a good idea perhaps to warn if alloc size is
not a multiple of TYPE_SIZE_UNIT instead of just less-than ?
So it can catch cases like:
int *p = malloc (sizeof (int) + 2); // probably intended malloc
(sizeof (int) * 2)

FWIW, this is caught using -fanalyzer:
f.c: In function 'f':
f.c:3:12: warning: allocated buffer size is not a multiple of the
pointee's size [CWE-131] [-Wanalyzer-allocation-size]
3 |   int *p = __builtin_malloc (sizeof(int) + 2);
  |^~

Thanks,
Prathamesh
> +warning_at (location, OPT_Walloc_type, "allocation of
> "
> +"insufficient size %qE for type %qT with
> "
> +"size %qE", arg, ttl, TYPE_SIZE_UNIT
> (ttl));
> +   }
> +   }
> +
>/* See if the pointers point to incompatible address spaces.  */
>asl = TYPE_ADDR_SPACE (ttl);
>asr = TYPE_ADDR_SPACE (ttr);
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 88e3c625030..6869bed64c3 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -8076,6 +8076,15 @@ always leads to a call to another @code{cold}
> function such as wrappers of
>  C++ @code{throw} or fatal error reporting functions leading to
> @code{abort}.
>  @end table
>
> +@opindex Wno-alloc-type
> +@opindex Walloc-type
> +@item -Walloc-type
> +Warn about calls to allocation functions decorated with attribute
> +@code{alloc_size} that specify insufficient size for the target type
> of
> +the pointer the result is assigned to, including those to the built-in
> +forms of the functions @code{aligned_alloc}, @code{alloca},
> @code{calloc},
> +@code{malloc}, and @code{realloc}.
> +
>  @opindex Wno-alloc-zero
>  @opindex Walloc-zero
>  @item -Walloc-zero
> diff --git a/gcc/testsuite/gcc.dg/Walloc-type-1.c
> b/gcc/testsuite/gcc.dg/Walloc-type-1.c
> new file mode 100644
> index 000..bc62e5e9aa3
> --- 

Re: [gcc-13] Backport PR10280 fix

2023-07-31 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 27 Jul 2023 at 12:04, Richard Biener  wrote:
>
> On Wed, 26 Jul 2023, Prathamesh Kulkarni wrote:
>
> > Sorry, I meant PR110280 in subject line (not PR10280).
>
> OK after 13.2 is released and the branch is open again.
Thanks, committed the patch to releases/gcc-13 branch in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=f4029de35fb1b293a4fd586574b1b4b73ddf7880

Thanks,
Prathamesh
>
> Richard.
>
> > On Wed, 26 Jul 2023 at 23:03, Prathamesh Kulkarni
> >  wrote:
> > >
> > > Hi Richard,
> > > Sorry for the delay in backport to gcc-13.
> > > The attached patch (cherry picked from master) is bootstrapped+tested
> > > on aarch64-linux-gnu with SVE enabled on gcc-13 branch.
> > > OK to commit to gcc-13 branch ?
> > >
> > > Thanks,
> > > Prathamesh
> >


Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-07-28 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 25 Jul 2023 at 18:25, Richard Sandiford
 wrote:
>
> Hi,
>
> Thanks for the rework and sorry for the slow review.
Hi Richard,
Thanks for the suggestions!  Please find my responses inline below.
>
> Prathamesh Kulkarni  writes:
> > Hi Richard,
> > This is reworking of patch to extend fold_vec_perm to handle VLA vectors.
> > The attached patch unifies handling of VLS and VLA vector_csts, while
> > using fallback code
> > for ctors.
> >
> > For VLS vector, the patch ignores underlying encoding, and
> > uses npatterns = nelts, and nelts_per_pattern = 1.
> >
> > For VLA patterns, if sel has a stepped sequence, then it
> > only chooses elements from a particular pattern of a particular
> > input vector.
> >
> > To make things simpler, the patch imposes following constraints:
> > (a) op0_npatterns, op1_npatterns and sel_npatterns are powers of 2.
> > (b) The step size for a stepped sequence is a power of 2, and
> >   multiple of npatterns of chosen input vector.
> > (c) Runtime vector length of sel is a multiple of sel_npatterns.
> >  So, we don't handle sel.length = 2 + 2x and npatterns = 4.
> >
> > Eg:
> > op0, op1: npatterns = 2, nelts_per_pattern = 3
> > op0_len = op1_len = 16 + 16x.
> > sel = { 0, 0, 2, 0, 4, 0, ... }
> > npatterns = 2, nelts_per_pattern = 3.
> >
> > For pattern {0, 2, 4, ...}
> > Let,
> > a1 = 2
> > S = step size = 2
> >
> > Let Esel denote number of elements per pattern in sel at runtime.
> > Esel = (16 + 16x) / npatterns_sel
> > = (16 + 16x) / 2
> > = (8 + 8x)
> >
> > So, last element of pattern:
> > ae = a1 + (Esel - 2) * S
> >  = 2 + (8 + 8x - 2) * 2
> >  = 14 + 16x
> >
> > a1 /trunc arg0_len = 2 / (16 + 16x) = 0
> > ae /trunc arg0_len = (14 + 16x) / (16 + 16x) = 0
> > Since both are equal with quotient = 0, we select elements from op0.
> >
> > Since step size (S) is a multiple of npatterns(op0), we select
> > all elements from same pattern of op0.
> >
> > res_npatterns = max (op0_npatterns, max (op1_npatterns, sel_npatterns))
> >= max (2, max (2, 2))
> >= 2
> >
> > res_nelts_per_pattern = max (op0_nelts_per_pattern,
> > max (op1_nelts_per_pattern,
> >  
> > sel_nelts_per_pattern))
> > = max (3, max (3, 3))
> > = 3
> >
> > So res has encoding with npatterns = 2, nelts_per_pattern = 3.
> > res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... }
> >
> > Unfortunately, this results in an issue for poly_int_cst index:
> > For example,
> > op0, op1: npatterns = 1, nelts_per_pattern = 3
> > op0_len = op1_len = 4 + 4x
> >
> > sel: { 4 + 4x, 5 + 4x, 6 + 4x, ... } // should choose op1
> >
> > In this case,
> > a1 = 5 + 4x
> > S = (6 + 4x) - (5 + 4x) = 1
> > Esel = 4 + 4x
> >
> > ae = a1 + (esel - 2) * S
> >  = (5 + 4x) + (4 + 4x - 2) * 1
> >  = 7 + 8x
> >
> > IIUC, 7 + 8x will always be index for last element of op1 ?
> > if x = 0, len = 4, 7 + 8x = 7
> > if x = 1, len = 8, 7 + 8x = 15, etc.
> > So the stepped sequence will always choose elements
> > from op1 regardless of vector length for above case ?
> >
> > However,
> > ae /trunc op0_len
> > = (7 + 8x) / (4 + 4x)
> > which is not defined because 7/4 != 8/4
> > and we return NULL_TREE, but I suppose the expected result would be:
> > res: { op1[0], op1[1], op1[2], ... } ?
> >
> > The patch passes bootstrap+test on aarch64-linux-gnu with and without sve,
> > and on x86_64-unknown-linux-gnu.
> > I would be grateful for suggestions on how to proceed.
> >
> > Thanks,
> > Prathamesh
> >
> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > index a02ede79fed..8028b3e8e9a 100644
> > --- a/gcc/fold-const.cc
> > +++ b/gcc/fold-const.cc
> > @@ -85,6 +85,10 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "vec-perm-indices.h"
> >  #include "asan.h"
> >  #include "gimple-range.h"
> > +#include 
> > +#include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "print-tree.h"
> >
> >  /* Nonzero if we are folding constants inside an initializer or a C++
> > manifestly-constant

Re: [gcc-13] Backport PR10280 fix

2023-07-26 Thread Prathamesh Kulkarni via Gcc-patches
Sorry, I meant PR110280 in subject line (not PR10280).

On Wed, 26 Jul 2023 at 23:03, Prathamesh Kulkarni
 wrote:
>
> Hi Richard,
> Sorry for the delay in backport to gcc-13.
> The attached patch (cherry picked from master) is bootstrapped+tested
> on aarch64-linux-gnu with SVE enabled on gcc-13 branch.
> OK to commit to gcc-13 branch ?
>
> Thanks,
> Prathamesh


[gcc-13] Backport PR10280 fix

2023-07-26 Thread Prathamesh Kulkarni via Gcc-patches
Hi Richard,
Sorry for the delay in backport to gcc-13.
The attached patch (cherry picked from master) is bootstrapped+tested
on aarch64-linux-gnu with SVE enabled on gcc-13 branch.
OK to commit to gcc-13 branch ?

Thanks,
Prathamesh
[aarch64/match.pd] Fix ICE observed in PR110280.

gcc/ChangeLog:
PR tree-optimization/110280
* match.pd (vec_perm_expr(v, v, mask) -> v): Explicitly build vector
using build_vector_from_val with the element of input operand, and
mask's type if operand and mask's types don't match.

gcc/testsuite/ChangeLog:
PR tree-optimization/110280
* gcc.target/aarch64/sve/pr110280.c: New test.

(cherry picked from commit 85d8e0d8d5342ec8b4e6a54e22741c30b33c6f04)

diff --git a/gcc/match.pd b/gcc/match.pd
index 91182448250..c3bb4fbc0a7 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8292,7 +8292,14 @@ and,
 
 (simplify
  (vec_perm vec_same_elem_p@0 @0 @1)
- @0)
+ (if (types_match (type, TREE_TYPE (@0)))
+  @0
+  (with
+   {
+ tree elem = uniform_vector_p (@0);
+   }
+   (if (elem)
+{ build_vector_from_val (type, elem); }
 
 /* Push VEC_PERM earlier if that may help FMA perception (PR101895).  */
 (simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr110280.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr110280.c
new file mode 100644
index 000..d3279f38362
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr110280.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+#include "arm_sve.h"
+
+svuint32_t l()
+{
+  _Alignas(16) const unsigned int lanes[4] = {0, 0, 0, 0};
+  return svld1rq_u32(svptrue_b8(), lanes);
+}
+
+/* { dg-final { scan-tree-dump-not "VEC_PERM_EXPR" "optimized" } } */


Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-07-25 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 17 Jul 2023 at 17:44, Prathamesh Kulkarni
 wrote:
>
> Hi Richard,
> This is reworking of patch to extend fold_vec_perm to handle VLA vectors.
> The attached patch unifies handling of VLS and VLA vector_csts, while
> using fallback code
> for ctors.
>
> For VLS vector, the patch ignores underlying encoding, and
> uses npatterns = nelts, and nelts_per_pattern = 1.
>
> For VLA patterns, if sel has a stepped sequence, then it
> only chooses elements from a particular pattern of a particular
> input vector.
>
> To make things simpler, the patch imposes following constraints:
> (a) op0_npatterns, op1_npatterns and sel_npatterns are powers of 2.
> (b) The step size for a stepped sequence is a power of 2, and
>   multiple of npatterns of chosen input vector.
> (c) Runtime vector length of sel is a multiple of sel_npatterns.
>  So, we don't handle sel.length = 2 + 2x and npatterns = 4.
>
> Eg:
> op0, op1: npatterns = 2, nelts_per_pattern = 3
> op0_len = op1_len = 16 + 16x.
> sel = { 0, 0, 2, 0, 4, 0, ... }
> npatterns = 2, nelts_per_pattern = 3.
>
> For pattern {0, 2, 4, ...}
> Let,
> a1 = 2
> S = step size = 2
>
> Let Esel denote number of elements per pattern in sel at runtime.
> Esel = (16 + 16x) / npatterns_sel
> = (16 + 16x) / 2
> = (8 + 8x)
>
> So, last element of pattern:
> ae = a1 + (Esel - 2) * S
>  = 2 + (8 + 8x - 2) * 2
>  = 14 + 16x
>
> a1 /trunc arg0_len = 2 / (16 + 16x) = 0
> ae /trunc arg0_len = (14 + 16x) / (16 + 16x) = 0
> Since both are equal with quotient = 0, we select elements from op0.
>
> Since step size (S) is a multiple of npatterns(op0), we select
> all elements from same pattern of op0.
>
> res_npatterns = max (op0_npatterns, max (op1_npatterns, sel_npatterns))
> >= max (2, max (2, 2))
>= 2
>
> res_nelts_per_pattern = max (op0_nelts_per_pattern,
> max (op1_nelts_per_pattern,
>  
> sel_nelts_per_pattern))
> = max (3, max (3, 3))
> = 3
>
> So res has encoding with npatterns = 2, nelts_per_pattern = 3.
> res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... }
>
> Unfortunately, this results in an issue for poly_int_cst index:
> For example,
> op0, op1: npatterns = 1, nelts_per_pattern = 3
> op0_len = op1_len = 4 + 4x
>
> sel: { 4 + 4x, 5 + 4x, 6 + 4x, ... } // should choose op1
>
> In this case,
> a1 = 5 + 4x
> S = (6 + 4x) - (5 + 4x) = 1
> Esel = 4 + 4x
>
> ae = a1 + (esel - 2) * S
>  = (5 + 4x) + (4 + 4x - 2) * 1
>  = 7 + 8x
>
> IIUC, 7 + 8x will always be index for last element of op1 ?
> if x = 0, len = 4, 7 + 8x = 7
> if x = 1, len = 8, 7 + 8x = 15, etc.
> So the stepped sequence will always choose elements
> from op1 regardless of vector length for above case ?
>
> However,
> ae /trunc op0_len
> = (7 + 8x) / (4 + 4x)
> which is not defined because 7/4 != 8/4
> and we return NULL_TREE, but I suppose the expected result would be:
> res: { op1[0], op1[1], op1[2], ... } ?
>
> The patch passes bootstrap+test on aarch64-linux-gnu with and without sve,
> and on x86_64-unknown-linux-gnu.
> I would be grateful for suggestions on how to proceed.
Hi Richard,
ping: https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624675.html

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh


Re: [WIP RFC] analyzer: Add optional trim of the analyzer diagnostics going too deep [PR110543]

2023-07-22 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 21 Jul 2023 at 21:05, Benjamin Priour via Gcc-patches
 wrote:
>
> Hi,
>
> Upon David's request I've joined the in progress patch to the below email.
> I hope it makes more sense now.
>
> Best,
> Benjamin.
>
> -- Forwarded message -
> From: Benjamin Priour 
> Date: Tue, Jul 18, 2023 at 3:30 PM
> Subject: [RFC] analyzer: Add optional trim of the analyzer diagnostics
> going too deep [PR110543]
> To: , David Malcolm 
>
>
> Hi,
>
> I'd like to request comments on a patch I am writing for PR110543.
> The goal of this patch is to reduce the noise of the analyzer emitted
> diagnostics when dealing with
> system headers, or simply diagnostic paths that are too long. The new
> option only affects the display
> of the diagnostics, but doesn't hinder the actual analysis.
>
> I've defaulted the new option to "system", thus preventing the diagnostic
> paths from showing system headers.
> "never" corresponds to the pre-patch behavior, whereas you can also specify
> an unsigned value <n>
> that prevents paths from going deeper than <n> frames.
>
> fanalyzer-trim-diagnostics=
> > Common Joined RejectNegative ToLower Var(flag_analyzer_trim_diagnostics)
> > Init("system")
> > -fanalyzer-trim-diagnostics=[never|system|<n>] Trim diagnostics
> > paths that are too long before emission.
> >
>
> Does it sounds reasonable and user-friendly ?
>
> Regstrapping was a success against trunk, although one of the newly added
> test case fails for c++14.
> Note that the test case below was done with "never", thus behaves exactly
> as the pre-patch analyzer
> on x86_64-linux-gnu.
>
> /* { dg-additional-options "-fdiagnostics-plain-output
> > -fdiagnostics-path-format=inline-events -fanalyzer-trim-diagnostics=never"
> > } */
> > /* { dg-skip-if "" { c++98_only }  } */
> >
> > #include 
> > struct A {int x; int y;};
> >
> > int main () {
> >   std::shared_ptr a;
> >   a->x = 4; /* { dg-line deref_a } */
> >   /* { dg-warning "dereference of NULL" "" { target *-*-* } deref_a } */
> >
> >   return 0;
> > }
> >
> > /* { dg-begin-multiline-output "" }
> >   'int main()': events 1-2
> > |
> > |
> > +--> 'std::__shared_ptr_access<_Tp, _Lp, , 
> > >::element_type* std::__shared_ptr_access<_Tp, _Lp, ,
> >  >::operator->() const [with _Tp = A; __gnu_cxx::_Lock_policy
> > _Lp = __gnu_cxx::_S_atomic; bool  = false; bool  =
> > false]': events 3-4
> >|
> >|
> >+--> 'std::__shared_ptr_access<_Tp, _Lp, ,
> >  >::element_type* std::__shared_ptr_access<_Tp, _Lp,
> > ,  >::_M_get() const [with _Tp = A;
> > __gnu_cxx::_Lock_policy _Lp = __gnu_cxx::_S_atomic; bool  =
> > false; bool  = false]': events 5-6
> >   |
> >   |
> >   +--> 'std::__shared_ptr<_Tp, _Lp>::element_type*
> > std::__shared_ptr<_Tp, _Lp>::get() const [with _Tp = A;
> > __gnu_cxx::_Lock_policy _Lp = __gnu_cxx::_S_atomic]': events 7-8
> >  |
> >  |
> >   <--+
> >   |
> > 'std::__shared_ptr_access<_Tp, _Lp, ,
> >  >::element_type* std::__shared_ptr_access<_Tp, _Lp,
> > ,  >::_M_get() const [with _Tp = A;
> > __gnu_cxx::_Lock_policy _Lp = __gnu_cxx::_S_atomic; bool  =
> > false; bool  = false]': event 9
> >   |
> >   |
> ><--+
> >|
> >  'std::__shared_ptr_access<_Tp, _Lp, , 
> > >::element_type* std::__shared_ptr_access<_Tp, _Lp, ,
> >  >::operator->() const [with _Tp = A; __gnu_cxx::_Lock_policy
> > _Lp = __gnu_cxx::_S_atomic; bool  = false; bool  =
> > false]': event 10
> >|
> >|
> > <--+
> > |
> >   'int main()': events 11-12
> > |
> > |
> >{ dg-end-multiline-output "" } */
> >
>
>
> The first events "'int main()': events 1-2" vary in c++14 (get events 1-3).
>
> >
> > // c++14 with fully detailed output
> >   ‘int main()’: events 1-3
> > |
> > |8 | int main () {
> > |  | ^~~~
> > |  | |
> > |  | (1) entry to ‘main’
> > |9 |   std::shared_ptr a;
> > |  |  ~
> > |  |  |
> > |  |  (2)
> > ‘a.std::shared_ptr::.std::__shared_ptr > __gnu_cxx::_S_atomic>::_M_ptr’ is NULL
> > |   10 |   a->x = 4; /* { dg-line deref_a } */
> > |  |~~
> > |  ||
> > |  |(3) calling ‘std::__shared_ptr_access > __gnu_cxx::_S_atomic, false, false>::operator->’ from ‘main’
> >
>
> whereas c++17 and posterior give
>
> > // c++17 with fully detailed output
> >
> // ./xg++ -fanalyzer
> >  ../../gcc/gcc/testsuite/g++.dg/analyzer/fanalyzer-trim-diagnostics-never.C
> >  -B. -shared-libgcc -fanalyzer-trim-diagnostics=never -std=c++17
> >
>   ‘int main()’: events 1-2
> > |
> > |8 | int main () {
> > |  | ^~~~
> > |  | |
> > |  | (1) entry to ‘main’
> > |9 |   std::shared_ptr a;
> >   

Re: PING^1 [PATCH v7] tree-ssa-sink: Improve code sinking pass

2023-07-18 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 18 Jul 2023 at 13:26, Ajit Agarwal via Gcc-patches
 wrote:
>
>
> Ping!
>
> please review.
>
> Thanks & Regards
> Ajit
>
>
> This patch improves code sinking pass to sink statements before call to reduce
> register pressure.
> Review comments are incorporated.
>
> For example :
>
> void bar();
> int j;
> void foo(int a, int b, int c, int d, int e, int f)
> {
>   int l;
>   l = a + b + c + d +e + f;
>   if (a != 5)
> {
>   bar();
>   j = l;
> }
> }
>
> Code Sinking does the following:
>
> void bar();
> int j;
> void foo(int a, int b, int c, int d, int e, int f)
> {
>   int l;
>
>   if (a != 5)
> {
>   l = a + b + c + d +e + f;
>   bar();
>   j = l;
> }
> }
>
> Bootstrapped regtested on powerpc64-linux-gnu.
>
> Thanks & Regards
> Ajit
>
>
> tree-ssa-sink: Improve code sinking pass
>
> Currently, code sinking will sink code after function calls.  This increases
> register pressure for callee-saved registers.  The following patch improves
> code sinking by placing the sunk code before calls in the use block or in
> the immediate dominator of the use blocks.
>
> 2023-06-01  Ajit Kumar Agarwal  
>
> gcc/ChangeLog:
>
> PR tree-optimization/81953
> * tree-ssa-sink.cc (statement_sink_location): Move statements before
> calls.
> (def_use_same_block): New function.
> (select_best_block): Add heuristics to select the best blocks in the
> immediate post dominator.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/81953
> * gcc.dg/tree-ssa/ssa-sink-20.c: New testcase.
> * gcc.dg/tree-ssa/ssa-sink-21.c: New testcase.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-20.c | 15 
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c | 19 +
>  gcc/tree-ssa-sink.cc| 79 ++---
>  3 files changed, 87 insertions(+), 26 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-20.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-20.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-20.c
> new file mode 100644
> index 000..d3b79ca5803
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-20.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-sink-stats" } */
> +void bar();
> +int j;
> +void foo(int a, int b, int c, int d, int e, int f)
> +{
> +  int l;
> +  l = a + b + c + d +e + f;
> +  if (a != 5)
> +{
> +  bar();
> +  j = l;
> +}
> +}
> +/* { dg-final { scan-tree-dump 
> {l_12\s+=\s+_4\s+\+\s+f_11\(D\);\n\s+bar\s+\(\)} sink1 } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
> new file mode 100644
> index 000..84e7938c54f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-21.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-sink-stats" } */
> +void bar();
> +int j, x;
> +void foo(int a, int b, int c, int d, int e, int f)
> +{
> +  int l;
> +  l = a + b + c + d +e + f;
> +  if (a != 5)
> +{
> +  bar();
> +  if (b != 3)
> +x = 3;
> +  else
> +x = 5;
> +  j = l;
> +}
> +}
> +/* { dg-final { scan-tree-dump 
> {l_13\s+=\s+_4\s+\+\s+f_12\(D\);\n\s+bar\s+\(\)} sink1 } } */
> diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc
> index b1ba7a2ad6c..113c89d0967 100644
> --- a/gcc/tree-ssa-sink.cc
> +++ b/gcc/tree-ssa-sink.cc
> @@ -171,9 +171,28 @@ nearest_common_dominator_of_uses (def_operand_p def_p, 
> bool *debug_stmts)
>return commondom;
>  }
>
> +/* Return TRUE if immediate defs of STMT and STMT are in same
> + * block, FALSE otherwise.  */
> +
> +static bool
> +def_use_same_block (gimple *stmt)
> +{
> +  def_operand_p def;
> +  ssa_op_iter iter;
> +
> +  FOR_EACH_SSA_DEF_OPERAND (def, stmt, iter, SSA_OP_DEF)
> +{
> +  gimple *def_stmt = SSA_NAME_DEF_STMT (DEF_FROM_PTR (def));
> +  if ((gimple_bb (def_stmt) == gimple_bb (stmt)))
> +   return true;
Hi Ajit,
Just wondering, won't this always return true since you're iterating over defs,
and def_stmt == stmt ? Sorry, if I misunderstood.

Thanks,
Prathamesh
> + }
> +  return false;
> +}
> +
>  /* Given EARLY_BB and LATE_BB, two blocks in a path through the dominator
> tree, return the best basic block between them (inclusive) to place
> -   statements.
> +   statements. The best basic block should be an immediate dominator of
> +   best basic block if the use stmt is after the call.
>
> We want the most control dependent block in the shallowest loop nest.
>
> @@ -190,11 +209,22 @@ nearest_common_dominator_of_uses (def_operand_p def_p, 
> bool *debug_stmts)
>  static basic_block
>  select_best_block (basic_block early_bb,
>basic_block late_bb,
> -  gimple *stmt)
> +  gimple *stmt,
> +  gimple *use)
>  {
>basic_block best_bb 

Re: [PATCH] aarch64: remove useless pairs of rev instructions

2023-07-18 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 18 Jul 2023 at 15:20, Serval Martinot-Lagarde via Gcc-patches
 wrote:
>
> SVE generates superflous rev instructions that can be replaced
> by single mov instruction or a pair of (rev, mov) instructions
Hi Serval,
I had added a similar transform to remove pair of rev instructions in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=f0eabc52c9a2d3da0bfc201da7a5c1658b76e9a4

which seems to be removing the superfluous VEC_PERM_EXPR for
your test:
For example, following pair of VEC_PERM_EXPR is input to forwprop4:
  vect__4.9_58 = VEC_PERM_EXPR ;
  vect__4.12_63 = VEC_PERM_EXPR ;

and forwprop4 dump shows:
Removing dead stmt vect__4.12_63 = vect__4.8_57;

Which shows that forwprop replaced the VEC_PERM_EXPR pair with
assignment vect__4.12_63 = vect__4.8_57 (which turned out to be
eventually dead).

Sorry if this sounds silly to ask but could you let me know how to
reproduce this issue on trunk ?
I tried using -O3 -mcpu=generic+sve for your test but grepping for rev
didn't return any results in code-gen.

Thanks,
Prathamesh
>
> gcc/
> * config/aarch64/aarch64-sve.md: New peephole2.
> * testsuite/gcc.target/aarch64/sve/revrev.c: New dg test.
>
> Signed-off-by: Serval Martinot-Lagarde 
> ---
>  gcc/config/aarch64/aarch64-sve.md | 21 +++
>  gcc/testsuite/gcc.target/aarch64/sve/revrev.c | 13 
>  2 files changed, 34 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/revrev.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve.md 
> b/gcc/config/aarch64/aarch64-sve.md
> index da5534c3e32..e5e0c7ddfc5 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -8836,6 +8836,27 @@
>"TARGET_SVE"
>"rev\t%0., %1.")
>
> +(define_peephole2
> +  [(set (match_operand:SVE_ALL 0 "register_operand" "")
> +(unspec:SVE_ALL
> +  [(match_operand:SVE_ALL 1 "register_operand" "")] UNSPEC_REV))
> +   (set (match_operand:SVE_ALL 2 "register_operand" "")
> +(unspec:SVE_ALL
> +  [(match_dup 0)] UNSPEC_REV))]
> +  "TARGET_SVE"
> +  [(const_int 0)]
> +  {
> +if (REGNO (operands[2]) != REGNO (operands[0]))
> +{
> +  emit_insn (gen_rtx_SET (operands[2], operands[1]));
> +  rtx rev = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[1]), 
> UNSPEC_REV);
> +  emit_insn (gen_rtx_SET (operands[0], rev));
> +}
> +else
> +  emit_insn (gen_rtx_SET (operands[0], operands[1]));
> +DONE;
> +  })
> +
>  ;; -
>  ;;  [INT,FP] Special-purpose binary permutes
>  ;; -
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revrev.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/revrev.c
> new file mode 100644
> index 000..04af6eed291
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/revrev.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include 
> +
> +void
> +test (uint8_t a[], uint8_t b[], uint64_t N)
> +{
> +  for (uint64_t i = N; i > 0; i--)
> +a[i - 1] = b[i - 1];
> +}
> +
> +/* { dg-final { scan-assembler-not {\trev\t(z[0-9]+\.h), \1\n\trev\t\1, 
> \1\n} } } */
> --
> 2.21.0


[RFC] [v2] Extend fold_vec_perm to handle VLA vectors

2023-07-17 Thread Prathamesh Kulkarni via Gcc-patches
Hi Richard,
This is reworking of patch to extend fold_vec_perm to handle VLA vectors.
The attached patch unifies handling of VLS and VLA vector_csts, while
using fallback code
for ctors.

For VLS vector, the patch ignores underlying encoding, and
uses npatterns = nelts, and nelts_per_pattern = 1.

For VLA patterns, if sel has a stepped sequence, then it
only chooses elements from a particular pattern of a particular
input vector.

To make things simpler, the patch imposes following constraints:
(a) op0_npatterns, op1_npatterns and sel_npatterns are powers of 2.
(b) The step size for a stepped sequence is a power of 2, and
  multiple of npatterns of chosen input vector.
(c) Runtime vector length of sel is a multiple of sel_npatterns.
 So, we don't handle sel.length = 2 + 2x and npatterns = 4.

Eg:
op0, op1: npatterns = 2, nelts_per_pattern = 3
op0_len = op1_len = 16 + 16x.
sel = { 0, 0, 2, 0, 4, 0, ... }
npatterns = 2, nelts_per_pattern = 3.

For pattern {0, 2, 4, ...}
Let,
a1 = 2
S = step size = 2

Let Esel denote number of elements per pattern in sel at runtime.
Esel = (16 + 16x) / npatterns_sel
= (16 + 16x) / 2
= (8 + 8x)

So, last element of pattern:
ae = a1 + (Esel - 2) * S
 = 2 + (8 + 8x - 2) * 2
 = 14 + 16x

a1 /trunc arg0_len = 2 / (16 + 16x) = 0
ae /trunc arg0_len = (14 + 16x) / (16 + 16x) = 0
Since both are equal with quotient = 0, we select elements from op0.

Since step size (S) is a multiple of npatterns(op0), we select
all elements from same pattern of op0.

res_npatterns = max (op0_npatterns, max (op1_npatterns, sel_npatterns))
   = max (2, max (2, 2))
   = 2

res_nelts_per_pattern = max (op0_nelts_per_pattern,
max (op1_nelts_per_pattern,
 sel_nelts_per_pattern))
= max (3, max (3, 3))
= 3

So res has encoding with npatterns = 2, nelts_per_pattern = 3.
res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... }

Unfortunately, this results in an issue for poly_int_cst index:
For example,
op0, op1: npatterns = 1, nelts_per_pattern = 3
op0_len = op1_len = 4 + 4x

sel: { 4 + 4x, 5 + 4x, 6 + 4x, ... } // should choose op1

In this case,
a1 = 5 + 4x
S = (6 + 4x) - (5 + 4x) = 1
Esel = 4 + 4x

ae = a1 + (esel - 2) * S
 = (5 + 4x) + (4 + 4x - 2) * 1
 = 7 + 8x

IIUC, 7 + 8x will always be index for last element of op1 ?
if x = 0, len = 4, 7 + 8x = 7
if x = 1, len = 8, 7 + 8x = 15, etc.
So the stepped sequence will always choose elements
from op1 regardless of vector length for above case ?

However,
ae /trunc op0_len
= (7 + 8x) / (4 + 4x)
which is not defined because 7/4 != 8/4
and we return NULL_TREE, but I suppose the expected result would be:
res: { op1[0], op1[1], op1[2], ... } ?

The patch passes bootstrap+test on aarch64-linux-gnu with and without sve,
and on x86_64-unknown-linux-gnu.
I would be grateful for suggestions on how to proceed.

Thanks,
Prathamesh
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index a02ede79fed..8028b3e8e9a 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -85,6 +85,10 @@ along with GCC; see the file COPYING3.  If not see
 #include "vec-perm-indices.h"
 #include "asan.h"
 #include "gimple-range.h"
+#include 
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "print-tree.h"
 
 /* Nonzero if we are folding constants inside an initializer or a C++
manifestly-constant-evaluated context; zero otherwise.
@@ -10493,15 +10497,9 @@ fold_mult_zconjz (location_t loc, tree type, tree expr)
 static bool
 vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
 {
-  unsigned HOST_WIDE_INT i, nunits;
+  unsigned HOST_WIDE_INT i;
 
-  if (TREE_CODE (arg) == VECTOR_CST
-  && VECTOR_CST_NELTS (arg).is_constant ())
-{
-  for (i = 0; i < nunits; ++i)
-   elts[i] = VECTOR_CST_ELT (arg, i);
-}
-  else if (TREE_CODE (arg) == CONSTRUCTOR)
+  if (TREE_CODE (arg) == CONSTRUCTOR)
 {
   constructor_elt *elt;
 
@@ -10519,6 +10517,230 @@ vec_cst_ctor_to_array (tree arg, unsigned int nelts, 
tree *elts)
   return true;
 }
 
+/* Return a vector with (NPATTERNS, NELTS_PER_PATTERN) encoding.  */
+
+static tree
+vector_cst_reshape (tree vec, unsigned npatterns, unsigned nelts_per_pattern)
+{
+  gcc_assert (pow2p_hwi (npatterns));
+
+  if (VECTOR_CST_NPATTERNS (vec) == npatterns
+  && VECTOR_CST_NELTS_PER_PATTERN (vec) == nelts_per_pattern)
+return vec;
+
+  tree v = make_vector (exact_log2 (npatterns), nelts_per_pattern);
+  TREE_TYPE (v) = TREE_TYPE (vec);
+
+  unsigned nelts = npatterns * nelts_per_pattern;
+  for (unsigned i = 0; i < nelts; i++)
+VECTOR_CST_ENCODED_ELT(v, i) = vector_cst_elt (vec, i);
+  return v;
+}
+
+/* Helper routine for fold_vec_perm_vla to check if ARG is a suitable
+   operand for VLA vec_perm folding. If arg is VLS, then set
+   

Re: [Patch] libgomp: Use libnuma for OpenMP's partition=nearest allocation trait

2023-07-13 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 12 Jul 2023 at 17:35, Tobias Burnus  wrote:
>
> Now committed as r14-2462-g450b05ce54d3f0.
Hi Tobias,
The newly added tests in above commit -- alloc-11.c and alloc-12.c
seem to fail during execution
on armv8l-unknown-linux-gnueabihf:

Running libgomp:libgomp.c++/c++.exp ...
FAIL: libgomp.c++/../libgomp.c-c++-common/alloc-11.c execution test
FAIL: libgomp.c++/../libgomp.c-c++-common/alloc-12.c execution test

Running libgomp:libgomp.c/c.exp ...
FAIL: libgomp.c/../libgomp.c-c++-common/alloc-11.c execution test
FAIL: libgomp.c/../libgomp.c-c++-common/alloc-12.c execution test

Could you please investigate ?

Thanks,
Prathamesh
>
> Changes to the patch in previous email:
> * I fixed some issues found on the way,
> * The wording in the .texi has been improved/expanded, and
> * I included two testcases to exercise the two libraries (or
>the default allocator when it is not available at runtime).
>
> Given that the default allocation already works fine (nearest)
> and the normal "malloc" is more economic in terms of memory
> handling (not multiples of page size or requesting a fixed
> pool size), I was wondering whether this patch is really needed.
>
> But at the end: default can be changed (cf. below) and given
> the user the choice makes sense. The manual states what GCC does
> which should help to make a conscious choice.
>
> * * *
>
> I did experiment with the testcase attached to previous email
> plus using dlopen to obtain the functions from libnuma if available.
>
> It was also using:
> /* { dg-do run { target { dlopen } } } */
> /* { dg-additional-options "-ldl" } */
>
> However, the Linux kernel too often placed the allocated memory
> on the "wrong" node to be usable as a testcase. I did get be
> 0 to 15 misplaced allocations, depending on the run.
>
> Hence, there is no such testcase. Using numactrl --preferred=1 I
> could force the normal allocation to (mostly) use node 1 for
> allocations such that the difference between partiton = default/environment
> vs. partition = nearest was clearly visible. Hence it does work.
>
> Otherwise, the same applies as I wrote the yesterday:
>
> On 11.07.23 12:35, Tobias Burnus wrote:
>
> > While by default 'malloc' allocates memory on the same node as the
> > calling
> > process/thread ('numactl --show' shows 'preferred node: current',
> > Linux kernel memory policy MPOL_DEFAULT), this can be changed.
> > For instance, when running the program as follows, 'malloc' now
> > prefers to allocate on the second node:
> >   numactl --preferred=1 ./myproc
> >
> > Thus, it seems to be sensible to provide a means to ensure the 'nearest'
> > allocation.  The MPOL_LOCAL policy does so, as provided by
> > libnuma's numa_alloc_local. (Which is just wrapper around the syscalls
> > mmap and mbind.) As with (lib)memkind, there is a run-time dlopen check
> > for (lib)numa - and no numa*.h is required when bulding GCC.
> -
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
> München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
> Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
> München, HRB 106955


Re: Pushed: [PATCH v2] vect: Fix vectorized BIT_FIELD_REF for signed bit-fields [PR110557]

2023-07-11 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 10 Jul 2023 at 16:43, Xi Ruoyao via Gcc-patches
 wrote:
>
> On Mon, 2023-07-10 at 10:33 +, Richard Biener wrote:
> > On Fri, 7 Jul 2023, Xi Ruoyao wrote:
> >
> > > If a bit-field is signed and it's wider than the output type, we
> > > must
> > > ensure the extracted result sign-extended.  But this was not handled
> > > correctly.
> > >
> > > For example:
> > >
> > > int x : 8;
> > > long y : 55;
> > > bool z : 1;
> > >
> > > The vectorized extraction of y was:
> > >
> > > vect__ifc__49.29_110 =
> > >   MEM  [(struct Item
> > > *)vectp_a.27_108];
> > > vect_patt_38.30_112 =
> > >   vect__ifc__49.29_110 & { 9223372036854775552,
> > > 9223372036854775552 };
> > > vect_patt_39.31_113 = vect_patt_38.30_112 >> 8;
> > > vect_patt_40.32_114 =
> > >   VIEW_CONVERT_EXPR(vect_patt_39.31_113);
> > >
> > > This is obviously incorrect.  This pach has implemented it as:
> > >
> > > vect__ifc__25.16_62 =
> > >   MEM  [(struct Item
> > > *)vectp_a.14_60];
> > > vect_patt_31.17_63 =
> > >   VIEW_CONVERT_EXPR(vect__ifc__25.16_62);
> > > vect_patt_32.18_64 = vect_patt_31.17_63 << 1;
> > > vect_patt_33.19_65 = vect_patt_32.18_64 >> 9;
> >
> > OK.
>
> Pushed r14-2407 and r13-7553.
Hi Xi,
Your commit:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=63ae6bc60c0f67fb2791991bf4b6e7e0a907d420,

seems to cause following regressions on arm-linux-gnueabihf:
FAIL: g++.dg/vect/pr110557.cc  -std=c++98 (test for excess errors)
FAIL: g++.dg/vect/pr110557.cc  -std=c++14 (test for excess errors)
FAIL: g++.dg/vect/pr110557.cc  -std=c++17 (test for excess errors)
FAIL: g++.dg/vect/pr110557.cc  -std=c++20 (test for excess errors)

Excess error:
gcc/testsuite/g++.dg/vect/pr110557.cc:12:8: warning: width of
'Item::y' exceeds its type

Thanks,
Prathamesh
>
> --
> Xi Ruoyao 
> School of Aerospace Science and Technology, Xidian University


Re: [SVE] Fold svdupq to VEC_PERM_EXPR if elements are not constant

2023-06-27 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 28 Jun 2023 at 00:05, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi Richard,
> > Sorry I forgot to commit this patch, which you had approved in:
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615308.html
> >
> > Just for context for the following test:
> > svint32_t f_s32(int32x4_t x)
> > {
> >   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > }
> >
> > -O3 -mcpu=generic+sve generates following code after interleave+zip1 patch:
> > f_s32:
> > dup s31, v0.s[1]
> > mov v30.8b, v0.8b
> > ins v31.s[1], v0.s[3]
> > ins v30.s[1], v0.s[2]
> > zip1v0.4s, v30.4s, v31.4s
> > dup z0.q, z0.q[0]
> > ret
> >
> > Code-gen with attached patch:
> > f_s32:
> > dup z0.q, z0.q[0]
> > ret
> >
> > Bootstrapped+tested on aarch64-linux-gnu.
> > OK to commit ?
> >
> > Thanks,
> > Prathamesh
> >
> > [SVE] Fold svdupq to VEC_PERM_EXPR if elements are not constant.
> >
> > gcc/ChangeLog:
> > * config/aarch64/aarch64-sve-builtins-base.cc
> > (svdupq_impl::fold_nonconst_dupq): New method.
> > (svdupq_impl::fold): Call fold_nonconst_dupq.
> >
> > gcc/testsuite/ChangeLog:
> > * gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.
>
> OK, thanks.
Thanks, pushed to trunk in 231f6b56c77c50f337f2529b3ae51e2083ce461d

Thanks,
Prathamesh
>
> Richard
>
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index 95b4cb8a943..9010ecca6da 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -817,6 +817,52 @@ public:
> >
> >  class svdupq_impl : public quiet
> >  {
> > +private:
> > +  gimple *
> > +  fold_nonconst_dupq (gimple_folder ) const
> > +  {
> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> > +   tmp = {arg0, arg1, ..., arg}
> > +   lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> > +
> > +if (f.type_suffix (0).bool_p
> > + || BYTES_BIG_ENDIAN)
> > +  return NULL;
> > +
> > +tree lhs = gimple_call_lhs (f.call);
> > +tree lhs_type = TREE_TYPE (lhs);
> > +tree elt_type = TREE_TYPE (lhs_type);
> > +scalar_mode elt_mode = SCALAR_TYPE_MODE (elt_type);
> > +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> > +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
> > +
> > +unsigned nargs = gimple_call_num_args (f.call);
> > +vec *v;
> > +vec_alloc (v, nargs);
> > +for (unsigned i = 0; i < nargs; i++)
> > +  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
> > +tree vec = build_constructor (vq_type, v);
> > +tree tmp = make_ssa_name_fn (cfun, vq_type, 0);
> > +gimple *g = gimple_build_assign (tmp, vec);
> > +
> > +gimple_seq stmts = NULL;
> > +gimple_seq_add_stmt_without_update (, g);
> > +
> > +poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> > +vec_perm_builder sel (lhs_len, nargs, 1);
> > +for (unsigned i = 0; i < nargs; i++)
> > +  sel.quick_push (i);
> > +
> > +vec_perm_indices indices (sel, 1, nargs);
> > +tree mask_type = build_vector_type (ssizetype, lhs_len);
> > +tree mask = vec_perm_indices_to_tree (mask_type, indices);
> > +
> > +gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
> > +gimple_seq_add_stmt_without_update (, g2);
> > +gsi_replace_with_seq (f.gsi, stmts, false);
> > +return g2;
> > +  }
> > +
> >  public:
> >gimple *
> >fold (gimple_folder ) const override
> > @@ -832,7 +878,7 @@ public:
> >{
> >   tree elt = gimple_call_arg (f.call, i);
> >   if (!CONSTANT_CLASS_P (elt))
> > -   return NULL;
> > +   return fold_nonconst_dupq (f);
> >   builder.quick_push (elt);
> >   for (unsigned int j = 1; j < factor; ++j)
> > builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c 
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> > new file mode 100644
> > index 000..f19f8deb1e5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/gen

[SVE] Fold svdupq to VEC_PERM_EXPR if elements are not constant

2023-06-27 Thread Prathamesh Kulkarni via Gcc-patches
Hi Richard,
Sorry I forgot to commit this patch, which you had approved in:
https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615308.html

Just for context for the following test:
svint32_t f_s32(int32x4_t x)
{
  return svdupq_s32 (x[0], x[1], x[2], x[3]);
}

-O3 -mcpu=generic+sve generates following code after interleave+zip1 patch:
f_s32:
dup s31, v0.s[1]
mov v30.8b, v0.8b
ins v31.s[1], v0.s[3]
ins v30.s[1], v0.s[2]
zip1v0.4s, v30.4s, v31.4s
dup z0.q, z0.q[0]
ret

Code-gen with attached patch:
f_s32:
dup z0.q, z0.q[0]
ret

Bootstrapped+tested on aarch64-linux-gnu.
OK to commit ?

Thanks,
Prathamesh
[SVE] Fold svdupq to VEC_PERM_EXPR if elements are not constant.

gcc/ChangeLog:
* config/aarch64/aarch64-sve-builtins-base.cc
(svdupq_impl::fold_nonconst_dupq): New method.
(svdupq_impl::fold): Call fold_nonconst_dupq.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 95b4cb8a943..9010ecca6da 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -817,6 +817,52 @@ public:
 
 class svdupq_impl : public quiet
 {
+private:
+  gimple *
+  fold_nonconst_dupq (gimple_folder ) const
+  {
+/* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
+   tmp = {arg0, arg1, ..., arg}
+   lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
+
+if (f.type_suffix (0).bool_p
+   || BYTES_BIG_ENDIAN)
+  return NULL;
+
+tree lhs = gimple_call_lhs (f.call);
+tree lhs_type = TREE_TYPE (lhs);
+tree elt_type = TREE_TYPE (lhs_type);
+scalar_mode elt_mode = SCALAR_TYPE_MODE (elt_type);
+machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
+
+unsigned nargs = gimple_call_num_args (f.call);
+vec *v;
+vec_alloc (v, nargs);
+for (unsigned i = 0; i < nargs; i++)
+  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
+tree vec = build_constructor (vq_type, v);
+tree tmp = make_ssa_name_fn (cfun, vq_type, 0);
+gimple *g = gimple_build_assign (tmp, vec);
+
+gimple_seq stmts = NULL;
+gimple_seq_add_stmt_without_update (, g);
+
+poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+vec_perm_builder sel (lhs_len, nargs, 1);
+for (unsigned i = 0; i < nargs; i++)
+  sel.quick_push (i);
+
+vec_perm_indices indices (sel, 1, nargs);
+tree mask_type = build_vector_type (ssizetype, lhs_len);
+tree mask = vec_perm_indices_to_tree (mask_type, indices);
+
+gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
+gimple_seq_add_stmt_without_update (, g2);
+gsi_replace_with_seq (f.gsi, stmts, false);
+return g2;
+  }
+
 public:
   gimple *
   fold (gimple_folder ) const override
@@ -832,7 +878,7 @@ public:
   {
tree elt = gimple_call_arg (f.call, i);
if (!CONSTANT_CLASS_P (elt))
- return NULL;
+ return fold_nonconst_dupq (f);
builder.quick_push (elt);
for (unsigned int j = 1; j < factor; ++j)
  builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
new file mode 100644
index 000..f19f8deb1e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+#include 
+#include 
+
+svint8_t f_s8(int8x16_t x)
+{
+  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+   x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
+}
+
+svint16_t f_s16(int16x8_t x)
+{
+  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
+}
+
+svint32_t f_s32(int32x4_t x)
+{
+  return svdupq_s32 (x[0], x[1], x[2], x[3]);
+}
+
+svint64_t f_s64(int64x2_t x)
+{
+  return svdupq_s64 (x[0], x[1]);
+}
+
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 
} } */


Re: [PATCH] arm: Fix MVE intrinsics support with LTO (PR target/110268)

2023-06-26 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 26 Jun 2023 at 20:33, Christophe Lyon via Gcc-patches
 wrote:
>
> After the recent MVE intrinsics re-implementation, LTO stopped working
> because the intrinsics would no longer be defined.
>
> The main part of the patch is simple and similar to what we do for
> AArch64:
> - call handle_arm_mve_h() from arm_init_mve_builtins to declare the
>   intrinsics when the compiler is in LTO mode
> - actually implement arm_builtin_decl for MVE.
>
> It was just a bit tricky to handle __ARM_MVE_PRESERVE_USER_NAMESPACE:
> its value in the user code cannot be guessed at LTO time, so we always
> have to assume that it was not defined.  The led to a few fixes in the
> way we register MVE builtins as placeholders or not.  Without this
> patch, we would just omit some versions of the intrinsics when
> __ARM_MVE_PRESERVE_USER_NAMESPACE is true. In fact, like for the C/C++
> placeholders, we need to always keep entries for all of them to ensure
> that we have a consistent numbering scheme.
>
> 2023-06-26  Christophe Lyon   
>
> PR target/110268
> gcc/
> * config/arm/arm-builtins.cc (arm_init_mve_builtins): Handle LTO.
> (arm_builtin_decl): Handle MVE builtins.
> * config/arm/arm-mve-builtins.cc (builtin_decl): New function.
> (add_unique_function): Fix handling of
> __ARM_MVE_PRESERVE_USER_NAMESPACE.
> (add_overloaded_function): Likewise.
> * config/arm/arm-protos.h (builtin_decl): New declaration.
>
> gcc/testsuite/
> * gcc.target/arm/pr110268-1.c: New test.
> * gcc.target/arm/pr110268-2.c: New test.
> ---
>  gcc/config/arm/arm-builtins.cc| 11 +++-
>  gcc/config/arm/arm-mve-builtins.cc| 61 ---
>  gcc/config/arm/arm-protos.h   |  1 +
>  gcc/testsuite/gcc.target/arm/pr110268-1.c | 11 
>  gcc/testsuite/gcc.target/arm/pr110268-2.c | 22 
>  5 files changed, 76 insertions(+), 30 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/pr110268-1.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/pr110268-2.c
>
> diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc
> index 36365e40a5b..fca7dcaf565 100644
> --- a/gcc/config/arm/arm-builtins.cc
> +++ b/gcc/config/arm/arm-builtins.cc
> @@ -1918,6 +1918,15 @@ arm_init_mve_builtins (void)
>arm_builtin_datum *d = _builtin_data[i];
>arm_init_builtin (fcode, d, "__builtin_mve");
>  }
> +
> +  if (in_lto_p)
> +{
> +  arm_mve::handle_arm_mve_types_h ();
> +  /* Under LTO, we cannot know whether
> +__ARM_MVE_PRESERVE_USER_NAMESPACE was defined, so assume it
> +was not.  */
> +  arm_mve::handle_arm_mve_h (false);
> +}
>  }
>
>  /* Set up all the NEON builtins, even builtins for instructions that are not
> @@ -2723,7 +2732,7 @@ arm_builtin_decl (unsigned code, bool initialize_p 
> ATTRIBUTE_UNUSED)
>  case ARM_BUILTIN_GENERAL:
>return arm_general_builtin_decl (subcode);
>  case ARM_BUILTIN_MVE:
> -  return error_mark_node;
> +  return arm_mve::builtin_decl (subcode);
>  default:
>gcc_unreachable ();
>  }
> diff --git a/gcc/config/arm/arm-mve-builtins.cc 
> b/gcc/config/arm/arm-mve-builtins.cc
> index 7033e41a571..e9a12f27411 100644
> --- a/gcc/config/arm/arm-mve-builtins.cc
> +++ b/gcc/config/arm/arm-mve-builtins.cc
> @@ -493,6 +493,16 @@ handle_arm_mve_h (bool preserve_user_namespace)
>  preserve_user_namespace);
>  }
>
> +/* Return the function decl with SVE function subcode CODE, or 
> error_mark_node
> +   if no such function exists.  */
Hi Christophe,
Sorry to nitpick -- s/SVE/MVE ? :)

Thanks,
Prathamesh
> +tree
> +builtin_decl (unsigned int code)
> +{
> +  if (code >= vec_safe_length (registered_functions))
> +return error_mark_node;
> +  return (*registered_functions)[code]->decl;
> +}
> +
>  /* Return true if CANDIDATE is equivalent to MODEL_TYPE for overloading
> purposes.  */
>  static bool
> @@ -849,7 +859,6 @@ function_builder::add_function (const function_instance 
> ,
>  ? integer_zero_node
>  : simulate_builtin_function_decl (input_location, name, fntype,
>   code, NULL, attrs);
> -
>registered_function  = *ggc_alloc  ();
>rfn.instance = instance;
>rfn.decl = decl;
> @@ -889,15 +898,12 @@ function_builder::add_unique_function (const 
> function_instance ,
>gcc_assert (!*rfn_slot);
>*rfn_slot = 
>
> -  /* Also add the non-prefixed non-overloaded function, if the user namespace
> - does not need to be preserved.  */
> -  if (!preserve_user_namespace)
> -{
> -  char *noprefix_name = get_name (instance, false, false);
> -  tree attrs = get_attributes (instance);
> -  add_function (instance, noprefix_name, fntype, attrs, requires_float,
> -   false, false);
> -}
> +  /* Also add the non-prefixed non-overloaded function, as placeholder
> 

Re: [SVE][match.pd] Fix ICE observed in PR110280

2023-06-23 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 23 Jun 2023 at 14:58, Richard Biener  wrote:
>
> On Fri, Jun 23, 2023 at 11:09 AM Prathamesh Kulkarni
>  wrote:
> >
> > On Thu, 22 Jun 2023 at 18:06, Richard Biener  
> > wrote:
> > >
> > > On Thu, Jun 22, 2023 at 11:08 AM Prathamesh Kulkarni
> > >  wrote:
> > > >
> > > > On Tue, 20 Jun 2023 at 16:47, Richard Biener 
> > > >  wrote:
> > > > >
> > > > > On Tue, Jun 20, 2023 at 11:56 AM Prathamesh Kulkarni via Gcc-patches
> > > > >  wrote:
> > > > > >
> > > > > > Hi Richard,
> > > > > > For the following reduced test-case taken from PR:
> > > > > >
> > > > > > #include "arm_sve.h"
> > > > > > svuint32_t l() {
> > > > > >   alignas(16) const unsigned int lanes[4] = {0, 0, 0, 0};
> > > > > >   return svld1rq_u32(svptrue_b8(), lanes);
> > > > > > }
> > > > > >
> > > > > > compiling with -O3 -mcpu=generic+sve results in following ICE:
> > > > > > during GIMPLE pass: fre
> > > > > > pr110280.c: In function 'l':
> > > > > > pr110280.c:5:1: internal compiler error: in eliminate_stmt, at
> > > > > > tree-ssa-sccvn.cc:6890
> > > > > > 5 | }
> > > > > >   | ^
> > > > > > 0x865fb1 eliminate_dom_walker::eliminate_stmt(basic_block_def*,
> > > > > > gimple_stmt_iterator*)
> > > > > > ../../gcc/gcc/tree-ssa-sccvn.cc:6890
> > > > > > 0x120bf4d 
> > > > > > eliminate_dom_walker::before_dom_children(basic_block_def*)
> > > > > > ../../gcc/gcc/tree-ssa-sccvn.cc:7324
> > > > > > 0x120bf4d 
> > > > > > eliminate_dom_walker::before_dom_children(basic_block_def*)
> > > > > > ../../gcc/gcc/tree-ssa-sccvn.cc:7257
> > > > > > 0x1aeec77 dom_walker::walk(basic_block_def*)
> > > > > > ../../gcc/gcc/domwalk.cc:311
> > > > > > 0x11fd924 eliminate_with_rpo_vn(bitmap_head*)
> > > > > > ../../gcc/gcc/tree-ssa-sccvn.cc:7504
> > > > > > 0x1214664 do_rpo_vn_1
> > > > > > ../../gcc/gcc/tree-ssa-sccvn.cc:8616
> > > > > > 0x1215ba5 execute
> > > > > > ../../gcc/gcc/tree-ssa-sccvn.cc:8702
> > > > > >
> > > > > > cc1 simplifies:
> > > > > >   lanes[0] = 0;
> > > > > >   lanes[1] = 0;
> > > > > >   lanes[2] = 0;
> > > > > >   lanes[3] = 0;
> > > > > >   _1 = { -1, ... };
> > > > > >   _7 = svld1rq_u32 (_1, );
> > > > > >
> > > > > > to:
> > > > > >   _9 = MEM  [(unsigned int * 
> > > > > > {ref-all})];
> > > > > >   _7 = VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }>;
> > > > > >
> > > > > > and then fre1 dump shows:
> > > > > > Applying pattern match.pd:8675, generic-match-5.cc:9025
> > > > > > Match-and-simplified VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }> to 
> > > > > > {
> > > > > > 0, 0, 0, 0 }
> > > > > > RHS VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }> simplified to { 0, 
> > > > > > 0, 0, 0 }
> > > > > >
> > > > > > The issue seems to be with the following pattern:
> > > > > > (simplify
> > > > > >  (vec_perm vec_same_elem_p@0 @0 @1)
> > > > > >  @0)
> > > > > >
> > > > > > which simplifies above VEC_PERM_EXPR to:
> > > > > > _7 = {0, 0, 0, 0}
> > > > > > which is incorrect since _9 and mask have different vector lengths.
> > > > > >
> > > > > > The attached patch amends the pattern to simplify above 
> > > > > > VEC_PERM_EXPR
> > > > > > only if operand and mask have same number of elements, which seems 
> > > > > > to fix
> > > > > > the issue, and we're left with the following in .optimized dump:
> > > > > >[local count: 1073741824]:
> > > > > >   _2 = VEC_PERM_EXPR <{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 1, 2, 3, 
> > > > > > ... }>;
> > > > >
> > > > > it would be nic

Re: [SVE][match.pd] Fix ICE observed in PR110280

2023-06-23 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 22 Jun 2023 at 18:06, Richard Biener  wrote:
>
> On Thu, Jun 22, 2023 at 11:08 AM Prathamesh Kulkarni
>  wrote:
> >
> > On Tue, 20 Jun 2023 at 16:47, Richard Biener  
> > wrote:
> > >
> > > On Tue, Jun 20, 2023 at 11:56 AM Prathamesh Kulkarni via Gcc-patches
> > >  wrote:
> > > >
> > > > Hi Richard,
> > > > For the following reduced test-case taken from PR:
> > > >
> > > > #include "arm_sve.h"
> > > > svuint32_t l() {
> > > >   alignas(16) const unsigned int lanes[4] = {0, 0, 0, 0};
> > > >   return svld1rq_u32(svptrue_b8(), lanes);
> > > > }
> > > >
> > > > compiling with -O3 -mcpu=generic+sve results in following ICE:
> > > > during GIMPLE pass: fre
> > > > pr110280.c: In function 'l':
> > > > pr110280.c:5:1: internal compiler error: in eliminate_stmt, at
> > > > tree-ssa-sccvn.cc:6890
> > > > 5 | }
> > > >   | ^
> > > > 0x865fb1 eliminate_dom_walker::eliminate_stmt(basic_block_def*,
> > > > gimple_stmt_iterator*)
> > > > ../../gcc/gcc/tree-ssa-sccvn.cc:6890
> > > > 0x120bf4d eliminate_dom_walker::before_dom_children(basic_block_def*)
> > > > ../../gcc/gcc/tree-ssa-sccvn.cc:7324
> > > > 0x120bf4d eliminate_dom_walker::before_dom_children(basic_block_def*)
> > > > ../../gcc/gcc/tree-ssa-sccvn.cc:7257
> > > > 0x1aeec77 dom_walker::walk(basic_block_def*)
> > > > ../../gcc/gcc/domwalk.cc:311
> > > > 0x11fd924 eliminate_with_rpo_vn(bitmap_head*)
> > > > ../../gcc/gcc/tree-ssa-sccvn.cc:7504
> > > > 0x1214664 do_rpo_vn_1
> > > > ../../gcc/gcc/tree-ssa-sccvn.cc:8616
> > > > 0x1215ba5 execute
> > > > ../../gcc/gcc/tree-ssa-sccvn.cc:8702
> > > >
> > > > cc1 simplifies:
> > > >   lanes[0] = 0;
> > > >   lanes[1] = 0;
> > > >   lanes[2] = 0;
> > > >   lanes[3] = 0;
> > > >   _1 = { -1, ... };
> > > >   _7 = svld1rq_u32 (_1, );
> > > >
> > > > to:
> > > >   _9 = MEM  [(unsigned int * {ref-all})];
> > > >   _7 = VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }>;
> > > >
> > > > and then fre1 dump shows:
> > > > Applying pattern match.pd:8675, generic-match-5.cc:9025
> > > > Match-and-simplified VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }> to {
> > > > 0, 0, 0, 0 }
> > > > RHS VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }> simplified to { 0, 0, 
> > > > 0, 0 }
> > > >
> > > > The issue seems to be with the following pattern:
> > > > (simplify
> > > >  (vec_perm vec_same_elem_p@0 @0 @1)
> > > >  @0)
> > > >
> > > > which simplifies above VEC_PERM_EXPR to:
> > > > _7 = {0, 0, 0, 0}
> > > > which is incorrect since _9 and mask have different vector lengths.
> > > >
> > > > The attached patch amends the pattern to simplify above VEC_PERM_EXPR
> > > > only if operand and mask have same number of elements, which seems to 
> > > > fix
> > > > the issue, and we're left with the following in .optimized dump:
> > > >[local count: 1073741824]:
> > > >   _2 = VEC_PERM_EXPR <{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 1, 2, 3, ... 
> > > > }>;
> > >
> > > it would be nice to have this optimized.
> > >
> > > -
> > >  (simplify
> > >   (vec_perm vec_same_elem_p@0 @0 @1)
> > > - @0)
> > > + (if (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)),
> > > +   TYPE_VECTOR_SUBPARTS (TREE_TYPE (@1
> > > +  @0))
> > >
> > > that looks good I think.  Maybe even better use 'type' instead of 
> > > TREE_TYPE (@1)
> > > since that's more obviously the return type in which case
> > >
> > >   (if (types_match (type, TREE_TYPE (@0))
> > >
> > > would be more to the point.
> > >
> > > But can't you to simplify this in the !known_eq case do a simple
> > >
> > >   { build_vector_from_val (type, the-element); }
> > >
> > > ?  The 'vec_same_elem_p' predicate doesn't get you at the element,
> > >
> > >  (with { tree el = uniform_vector_p (@0); }
> > >   (if (el)
> > >{ build_vector_from_val (type, el); })))
> > >
> > > would b

Re: [SVE][match.pd] Fix ICE observed in PR110280

2023-06-22 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 20 Jun 2023 at 16:47, Richard Biener  wrote:
>
> On Tue, Jun 20, 2023 at 11:56 AM Prathamesh Kulkarni via Gcc-patches
>  wrote:
> >
> > Hi Richard,
> > For the following reduced test-case taken from PR:
> >
> > #include "arm_sve.h"
> > svuint32_t l() {
> >   alignas(16) const unsigned int lanes[4] = {0, 0, 0, 0};
> >   return svld1rq_u32(svptrue_b8(), lanes);
> > }
> >
> > compiling with -O3 -mcpu=generic+sve results in following ICE:
> > during GIMPLE pass: fre
> > pr110280.c: In function 'l':
> > pr110280.c:5:1: internal compiler error: in eliminate_stmt, at
> > tree-ssa-sccvn.cc:6890
> > 5 | }
> >   | ^
> > 0x865fb1 eliminate_dom_walker::eliminate_stmt(basic_block_def*,
> > gimple_stmt_iterator*)
> > ../../gcc/gcc/tree-ssa-sccvn.cc:6890
> > 0x120bf4d eliminate_dom_walker::before_dom_children(basic_block_def*)
> > ../../gcc/gcc/tree-ssa-sccvn.cc:7324
> > 0x120bf4d eliminate_dom_walker::before_dom_children(basic_block_def*)
> > ../../gcc/gcc/tree-ssa-sccvn.cc:7257
> > 0x1aeec77 dom_walker::walk(basic_block_def*)
> > ../../gcc/gcc/domwalk.cc:311
> > 0x11fd924 eliminate_with_rpo_vn(bitmap_head*)
> > ../../gcc/gcc/tree-ssa-sccvn.cc:7504
> > 0x1214664 do_rpo_vn_1
> > ../../gcc/gcc/tree-ssa-sccvn.cc:8616
> > 0x1215ba5 execute
> > ../../gcc/gcc/tree-ssa-sccvn.cc:8702
> >
> > cc1 simplifies:
> >   lanes[0] = 0;
> >   lanes[1] = 0;
> >   lanes[2] = 0;
> >   lanes[3] = 0;
> >   _1 = { -1, ... };
> >   _7 = svld1rq_u32 (_1, );
> >
> > to:
> >   _9 = MEM  [(unsigned int * {ref-all})];
> >   _7 = VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }>;
> >
> > and then fre1 dump shows:
> > Applying pattern match.pd:8675, generic-match-5.cc:9025
> > Match-and-simplified VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }> to {
> > 0, 0, 0, 0 }
> > RHS VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }> simplified to { 0, 0, 0, 0 }
> >
> > The issue seems to be with the following pattern:
> > (simplify
> >  (vec_perm vec_same_elem_p@0 @0 @1)
> >  @0)
> >
> > which simplifies above VEC_PERM_EXPR to:
> > _7 = {0, 0, 0, 0}
> > which is incorrect since _9 and mask have different vector lengths.
> >
> > The attached patch amends the pattern to simplify above VEC_PERM_EXPR
> > only if operand and mask have same number of elements, which seems to fix
> > the issue, and we're left with the following in .optimized dump:
> >[local count: 1073741824]:
> >   _2 = VEC_PERM_EXPR <{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 1, 2, 3, ... }>;
>
> it would be nice to have this optimized.
>
> -
>  (simplify
>   (vec_perm vec_same_elem_p@0 @0 @1)
> - @0)
> + (if (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)),
> +   TYPE_VECTOR_SUBPARTS (TREE_TYPE (@1
> +  @0))
>
> that looks good I think.  Maybe even better use 'type' instead of TREE_TYPE 
> (@1)
> since that's more obviously the return type in which case
>
>   (if (types_match (type, TREE_TYPE (@0))
>
> would be more to the point.
>
> But can't you to simplify this in the !known_eq case do a simple
>
>   { build_vector_from_val (type, the-element); }
>
> ?  The 'vec_same_elem_p' predicate doesn't get you at the element,
>
>  (with { tree el = uniform_vector_p (@0); }
>   (if (el)
>{ build_vector_from_val (type, el); })))
>
> would be the cheapest workaround.
Hi Richard,
Thanks for the suggestions. Using build_vector_from_val simplifies it to:
   [local count: 1073741824]:
  return { 0, ... };

Patch is bootstrapped+tested on aarch64-linux-gnu, in progress on
x86_64-linux-gnu.
OK to commit ?

Thanks,
Prathamesh
>
> >   return _2;
> >
> > code-gen:
> > l:
> > mov z0.b, #0
> > ret
> >
> > Patch is bootstrapped+tested on aarch64-linux-gnu.
> > OK to commit ?
> >
> > Thanks,
> > Prathamesh
[aarch64/match.pd] Fix ICE observed in PR110280.

gcc/ChangeLog:
PR tree-optimization/110280
* match.pd (vec_perm_expr(v, v, mask) -> v): Explicitly build vector
using build_vector_from_val with the element of input operand, and
mask's type.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/pr110280.c: New test.

diff --git a/gcc/match.pd b/gcc/match.pd
index 2dd23826034..76a37297d3c 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8672,7 +8672,12 @@ and,
 
 (simplify
  (vec_perm vec_same_elem_p@0 @0 @1)
- @0)
+ (with
+  {
+tree elem = uniform_vector_p (@0);
+  }
+  (if (elem)
+  

[SVE][match.pd] Fix ICE observed in PR110280

2023-06-20 Thread Prathamesh Kulkarni via Gcc-patches
Hi Richard,
For the following reduced test-case taken from PR:

#include "arm_sve.h"
svuint32_t l() {
  alignas(16) const unsigned int lanes[4] = {0, 0, 0, 0};
  return svld1rq_u32(svptrue_b8(), lanes);
}

compiling with -O3 -mcpu=generic+sve results in following ICE:
during GIMPLE pass: fre
pr110280.c: In function 'l':
pr110280.c:5:1: internal compiler error: in eliminate_stmt, at
tree-ssa-sccvn.cc:6890
5 | }
  | ^
0x865fb1 eliminate_dom_walker::eliminate_stmt(basic_block_def*,
gimple_stmt_iterator*)
../../gcc/gcc/tree-ssa-sccvn.cc:6890
0x120bf4d eliminate_dom_walker::before_dom_children(basic_block_def*)
../../gcc/gcc/tree-ssa-sccvn.cc:7324
0x120bf4d eliminate_dom_walker::before_dom_children(basic_block_def*)
../../gcc/gcc/tree-ssa-sccvn.cc:7257
0x1aeec77 dom_walker::walk(basic_block_def*)
../../gcc/gcc/domwalk.cc:311
0x11fd924 eliminate_with_rpo_vn(bitmap_head*)
../../gcc/gcc/tree-ssa-sccvn.cc:7504
0x1214664 do_rpo_vn_1
../../gcc/gcc/tree-ssa-sccvn.cc:8616
0x1215ba5 execute
../../gcc/gcc/tree-ssa-sccvn.cc:8702

cc1 simplifies:
  lanes[0] = 0;
  lanes[1] = 0;
  lanes[2] = 0;
  lanes[3] = 0;
  _1 = { -1, ... };
  _7 = svld1rq_u32 (_1, &lanes);

to:
  _9 = MEM  [(unsigned int * {ref-all})];
  _7 = VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }>;

and then fre1 dump shows:
Applying pattern match.pd:8675, generic-match-5.cc:9025
Match-and-simplified VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }> to {
0, 0, 0, 0 }
RHS VEC_PERM_EXPR <_9, _9, { 0, 1, 2, 3, ... }> simplified to { 0, 0, 0, 0 }

The issue seems to be with the following pattern:
(simplify
 (vec_perm vec_same_elem_p@0 @0 @1)
 @0)

which simplifies above VEC_PERM_EXPR to:
_7 = {0, 0, 0, 0}
which is incorrect since _9 and mask have different vector lengths.

The attached patch amends the pattern to simplify above VEC_PERM_EXPR
only if operand and mask have same number of elements, which seems to fix
the issue, and we're left with the following in .optimized dump:
   [local count: 1073741824]:
  _2 = VEC_PERM_EXPR <{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 1, 2, 3, ... }>;
  return _2;

code-gen:
l:
mov z0.b, #0
ret

Patch is bootstrapped+tested on aarch64-linux-gnu.
OK to commit ?

Thanks,
Prathamesh
[SVE][match.pd] Fix ICE observed in PR110280.

gcc/ChangeLog:
PR tree-optimization/110280
* match.pd (vec_perm_expr(v, v, mask) -> v): Simplify the pattern
only if operand and mask of VEC_PERM_EXPR have same number of
elements.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/pr110280.c: New test.

diff --git a/gcc/match.pd b/gcc/match.pd
index 2dd23826034..0eb5f8f0af6 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8669,10 +8669,11 @@ and,
  @0
  (if (uniform_vector_p (@0
 
-
 (simplify
  (vec_perm vec_same_elem_p@0 @0 @1)
- @0)
+ (if (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)),
+   TYPE_VECTOR_SUBPARTS (TREE_TYPE (@1
+  @0))
 
 /* Push VEC_PERM earlier if that may help FMA perception (PR101895).  */
 (simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr110280.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr110280.c
new file mode 100644
index 000..453c9cbcf9e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr110280.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_sve.h"
+
+svuint32_t l()
+{
+  _Alignas(16) const unsigned int lanes[4] = {0, 0, 0, 0};
+  return svld1rq_u32(svptrue_b8(), lanes);
+}


Re: [PATCH v2] [PR96339] Optimise svlast[ab]

2023-06-14 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 13 Jun 2023 at 12:38, Tejas Belagod via Gcc-patches
 wrote:
>
>
>
> From: Richard Sandiford 
> Date: Monday, June 12, 2023 at 2:15 PM
> To: Tejas Belagod 
> Cc: gcc-patches@gcc.gnu.org , Tejas Belagod 
> 
> Subject: Re: [PATCH v2] [PR96339] Optimise svlast[ab]
> Tejas Belagod  writes:
> > From: Tejas Belagod 
> >
> >   This PR optimizes an SVE intrinsics sequence where
> > svlasta (svptrue_pat_b8 (SV_VL1), x)
> >   a scalar is selected based on a constant predicate and a variable vector.
> >   This sequence is optimized to return the corresponding element of a NEON
> >   vector. For eg.
> > svlasta (svptrue_pat_b8 (SV_VL1), x)
> >   returns
> > umovw0, v0.b[1]
> >   Likewise,
> > svlastb (svptrue_pat_b8 (SV_VL1), x)
> >   returns
> >  umovw0, v0.b[0]
> >   This optimization only works provided the constant predicate maps to a 
> > range
> >   that is within the bounds of a 128-bit NEON register.
> >
> > gcc/ChangeLog:
> >
> >PR target/96339
> >* config/aarch64/aarch64-sve-builtins-base.cc (svlast_impl::fold): 
> > Fold sve
> >calls that have a constant input predicate vector.
> >(svlast_impl::is_lasta): Query to check if intrinsic is svlasta.
> >(svlast_impl::is_lastb): Query to check if intrinsic is svlastb.
> >(svlast_impl::vect_all_same): Check if all vector elements are equal.
> >
> > gcc/testsuite/ChangeLog:
> >
> >PR target/96339
> >* gcc.target/aarch64/sve/acle/general-c/svlast.c: New.
> >* gcc.target/aarch64/sve/acle/general-c/svlast128_run.c: New.
> >* gcc.target/aarch64/sve/acle/general-c/svlast256_run.c: New.
> >* gcc.target/aarch64/sve/pcs/return_4.c (caller_bf16): Fix asm
> >to expect optimized code for function body.
> >* gcc.target/aarch64/sve/pcs/return_4_128.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_4_256.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_4_512.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_4_1024.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_4_2048.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_5.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_5_128.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_5_256.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_5_512.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_5_1024.c (caller_bf16): Likewise.
> >* gcc.target/aarch64/sve/pcs/return_5_2048.c (caller_bf16): Likewise.
>
> OK, thanks.
>
> Applied on master, thanks.
Hi Tejas,
This seems to break aarch64 bootstrap build with following error due
to -Wsign-compare diagnostic:
00:18:19 
/home/tcwg-buildslave/workspace/tcwg_gnu_6/abe/snapshots/gcc.git~master/gcc/config/aarch64/aarch64-sve-builtins-base.cc:1133:35:
error: comparison of integer expressions of different signedness:
‘int’ and ‘long unsigned int’ [-Werror=sign-compare]
00:18:19  1133 | for (i = npats; i < enelts; i += step_1)
00:18:19  | ~~^~~~
00:30:46 abe-debug-build: cc1plus: all warnings being treated as errors
00:30:46 abe-debug-build: make[3]: ***
[/home/tcwg-buildslave/workspace/tcwg_gnu_6/abe/snapshots/gcc.git~master/gcc/config/aarch64/t-aarch64:96:
aarch64-sve-builtins-base.o] Error 1

Thanks,
Prathamesh
>
> Tejas.
>
>
> Richard


Re: [aarch64] Code-gen for vector initialization involving constants

2023-06-12 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 31 May 2023 at 00:23, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi Richard,
> > The s32 case for single constant patch doesn't regress now after the
> > above commit.
> > Bootstrapped+tested on aarch64-linux-gnu, and verified that the new
> > tests pass for aarch64_be-linux-gnu.
> > Is it OK to commit ?
> >
> > Thanks,
> > Prathamesh
> >
> > [aarch64] Improve code-gen for vector initialization with single constant 
> > element.
> >
> > gcc/ChangeLog:
> >   * config/aarch64/aarch64.cc (aarch64_expand_vector_init): Tweak 
> > condition
> >   if (n_var == n_elts && n_elts <= 16) to allow a single constant,
> >   and if maxv == 1, use constant element for duplicating into register.
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/vec-init-single-const.c: New test.
> >   * gcc.target/aarch64/vec-init-single-const-be.c: Likewise.
> >   * gcc.target/aarch64/vec-init-single-const-2.c: Likewise.
>
> OK, thanks.
Hi Richard,
Sorry for the delay, I was away on vacation. Committed the patch after
rebasing on ToT, and verifying bootstrap+test passes on
aarch64-linux-gnu:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=9eb757d11746c006c044ff45538b956be7f5859c

Thanks,
Prathamesh
>
> Richard
>
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 5b046d32b37..30d6e3e8d83 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22192,7 +22192,7 @@ aarch64_expand_vector_init_fallback (rtx target, 
> > rtx vals)
> >   and matches[X][1] with the count of duplicate elements (if X is the
> >   earliest element which has duplicates).  */
> >
> > -  if (n_var == n_elts && n_elts <= 16)
> > +  if (n_var >= n_elts - 1 && n_elts <= 16)
> >  {
> >int matches[16][2] = {0};
> >for (int i = 0; i < n_elts; i++)
> > @@ -22209,12 +22209,23 @@ aarch64_expand_vector_init_fallback (rtx target, 
> > rtx vals)
> >   }
> >int maxelement = 0;
> >int maxv = 0;
> > +  rtx const_elem = NULL_RTX;
> > +  int const_elem_pos = 0;
> > +
> >for (int i = 0; i < n_elts; i++)
> > - if (matches[i][1] > maxv)
> > -   {
> > - maxelement = i;
> > - maxv = matches[i][1];
> > -   }
> > + {
> > +   if (matches[i][1] > maxv)
> > + {
> > +   maxelement = i;
> > +   maxv = matches[i][1];
> > + }
> > +   if (CONST_INT_P (XVECEXP (vals, 0, i))
> > +   || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
> > + {
> > +   const_elem_pos = i;
> > +   const_elem = XVECEXP (vals, 0, i);
> > + }
> > + }
> >
> >/* Create a duplicate of the most common element, unless all elements
> >are equally useless to us, in which case just immediately set the
> > @@ -22252,8 +22263,19 @@ aarch64_expand_vector_init_fallback (rtx target, 
> > rtx vals)
> >vector register.  For big-endian we want that position to hold
> >the last element of VALS.  */
> > maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
> > -   rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> > -   aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
> > +
> > +   /* If we have a single constant element, use that for duplicating
> > +  instead.  */
> > +   if (const_elem)
> > + {
> > +   maxelement = const_elem_pos;
> > +   aarch64_emit_move (target, gen_vec_duplicate (mode, 
> > const_elem));
> > + }
> > +   else
> > + {
> > +   rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> > +   aarch64_emit_move (target, lowpart_subreg (mode, x, 
> > inner_mode));
> > + }
> >   }
> >else
> >   {
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const-2.c 
> > b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const-2.c
> > new file mode 100644
> > index 000..f4dcab429c1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const-2.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +#include 
> > +
> > +/* In case where there are no duplicate elements in vec

Re: [committed] libstdc++: Fix P2510R3 "Formatting pointers" [PR110149]

2023-06-12 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 9 Jun 2023 at 17:41, Jonathan Wakely via Gcc-patches
 wrote:
>
> Tested powerpc64le-linux. Pushed to trunk.
Hi Jonathan,
This patch causes following regression on armv8l-unknown-linux-gnueabihf:
FAIL: std/format/functions/format.cc execution test
/home/tcwg-buildslave/workspace/tcwg_gnu_3/abe/snapshots/gcc.git~master/libstdc++-v3/testsuite/std/format/functions/format.cc:368:
void test_pointer(): Assertion 's == (str_int + ' ' + str_int + "
0x0")' failed.
timeout: the monitored command dumped core

Full libstdc++.log:
https://people.linaro.org/~prathamesh.kulkarni/libstdc++.log.0.xz
Could you please check ?

Thanks,
Prathamesh



>
> I'll backport it to gcc-13 later.
>
> -- >8 --
>
> I had intended to support the P2510R3 proposal unconditionally in C++20
> mode, but I left it half implemented. The parse function supported the
> new extensions, but the format function didn't.
>
> This adds the missing pieces, and makes it only enabled for C++26 and
> non-strict modes.
>
> libstdc++-v3/ChangeLog:
>
> PR libstdc++/110149
> * include/std/format (formatter::parse):
> Only allow 0 and P for C++26 and non-strict modes.
> (formatter::format): Use toupper for P
> type, and insert zero-fill characters for 0 option.
> * testsuite/std/format/functions/format.cc: Check pointer
> formatting. Only check P2510R3 extensions conditionally.
> * testsuite/std/format/parse_ctx.cc: Only check P2510R3
> extensions conditionally.
> ---
>  libstdc++-v3/include/std/format   | 56 ---
>  .../testsuite/std/format/functions/format.cc  | 42 ++
>  .../testsuite/std/format/parse_ctx.cc | 15 +++--
>  3 files changed, 101 insertions(+), 12 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/format b/libstdc++-v3/include/std/format
> index 6edc3208afa..96a1e62ccc8 100644
> --- a/libstdc++-v3/include/std/format
> +++ b/libstdc++-v3/include/std/format
> @@ -830,7 +830,7 @@ namespace __format
> {
>   if (_M_spec._M_type == _Pres_esc)
> {
> - // TODO: C++20 escaped string presentation
> + // TODO: C++23 escaped string presentation
> }
>
>   if (_M_spec._M_width_kind == _WP_none
> @@ -2081,19 +2081,31 @@ namespace __format
> if (__finished())
>   return __first;
>
> -   // _GLIBCXX_RESOLVE_LIB_DEFECTS
> -   // P2519R3 Formatting pointers
> +// _GLIBCXX_RESOLVE_LIB_DEFECTS
> +// P2510R3 Formatting pointers
> +#define _GLIBCXX_P2518R3 (__cplusplus > 202302L || ! defined __STRICT_ANSI__)
> +
> +#if _GLIBCXX_P2518R3
> __first = __spec._M_parse_zero_fill(__first, __last);
> if (__finished())
>   return __first;
> +#endif
>
> __first = __spec._M_parse_width(__first, __last, __pc);
>
> -   if (__first != __last && (*__first == 'p' || *__first == 'P'))
> +   if (__first != __last)
>   {
> -   if (*__first == 'P')
> +   if (*__first == 'p')
> + ++__first;
> +#if _GLIBCXX_P2518R3
> +   else if (*__first == 'P')
> +   {
> + // _GLIBCXX_RESOLVE_LIB_DEFECTS
> + // P2510R3 Formatting pointers
>   __spec._M_type = __format::_Pres_P;
> -   ++__first;
> + ++__first;
> +   }
> +#endif
>   }
>
> if (__finished())
> @@ -2110,9 +2122,21 @@ namespace __format
>   char __buf[2 + sizeof(__v) * 2];
>   auto [__ptr, __ec] = std::to_chars(__buf + 2, std::end(__buf),
>  __u, 16);
> - const int __n = __ptr - __buf;
> + int __n = __ptr - __buf;
>   __buf[0] = '0';
>   __buf[1] = 'x';
> +#if _GLIBCXX_P2518R3
> + if (_M_spec._M_type == __format::_Pres_P)
> +   {
> + __buf[1] = 'X';
> + for (auto __p = __buf + 2; __p != __ptr; ++__p)
> +#if __has_builtin(__builtin_toupper)
> +   *__p = __builtin_toupper(*__p);
> +#else
> +   *__p = std::toupper(*__p);
> +#endif
> +   }
> +#endif
>
>   basic_string_view<_CharT> __str;
>   if constexpr (is_same_v<_CharT, char>)
> @@ -2126,6 +2150,24 @@ namespace __format
>   __str = wstring_view(__p, __n);
> }
>
> +#if _GLIBCXX_P2518R3
> + if (_M_spec._M_zero_fill)
> +   {
> + size_t __width = _M_spec._M_get_width(__fc);
> + if (__width <= __str.size())
> +   return __format::__write(__fc.out(), __str);
> +
> + auto __out = __fc.out();
> + // Write "0x" or "0X" prefix before zero-filling.
> + __out = __format::__write(std::move(__out), __str.substr(0, 2));
> + __str.remove_prefix(2);
> + size_t __nfill = __width - __n;
> + return __format::__write_padded(std::move(__out), __str,
> + 

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-25 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 25 May 2023 at 15:26, Prathamesh Kulkarni
 wrote:
>
> On Thu, 25 May 2023 at 13:04, Richard Sandiford
>  wrote:
> >
> > LGTM, just a couple of comment tweaks:
> >
> > Prathamesh Kulkarni  writes:
> > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > > index d6fc94015fa..db7ca4c28c3 100644
> > > --- a/gcc/config/aarch64/aarch64.cc
> > > +++ b/gcc/config/aarch64/aarch64.cc
> > > @@ -22332,6 +22332,46 @@ aarch64_unzip_vector_init (machine_mode mode, 
> > > rtx vals, bool even_p)
> > >return gen_rtx_PARALLEL (new_mode, vec);
> > >  }
> > >
> > > +/* Return true if INSN is a scalar move.  */
> >
> > s/INSN/SET/
> >
> > > +
> > > +static bool
> > > +scalar_move_insn_p (rtx set)
> > > +{
> > > +  rtx src = SET_SRC (set);
> > > +  rtx dest = SET_DEST (set);
> > > +  return (is_a <scalar_mode> (GET_MODE (dest))
> > > +   && aarch64_mov_operand (src, GET_MODE (dest)));
> > > +}
> > > +
> > > +/* Similar to seq_cost, but ignore cost for scalar moves.  This function
> > > +   is called from aarch64_expand_vector_init.  */
> >
> > Probably best to drop the second sentence.
> >
> > OK with those changes, thanks (no need to retest).
> Thanks, committed as ea9154dbc8fc86d4c617503ca5e6f02fed3a6a56.
Hi Richard,
The s32 case for single constant patch doesn't regress now after the
above commit.
Bootstrapped+tested on aarch64-linux-gnu, and verified that the new
tests pass for aarch64_be-linux-gnu.
Is it OK to commit ?

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > Richard
[aarch64] Improve code-gen for vector initialization with single constant 
element.

gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_expand_vector_init): Tweak condition
if (n_var == n_elts && n_elts <= 16) to allow a single constant,
and if maxv == 1, use constant element for duplicating into register.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vec-init-single-const.c: New test.
* gcc.target/aarch64/vec-init-single-const-be.c: Likewise.
* gcc.target/aarch64/vec-init-single-const-2.c: Likewise.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5b046d32b37..30d6e3e8d83 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22192,7 +22192,7 @@ aarch64_expand_vector_init_fallback (rtx target, rtx 
vals)
  and matches[X][1] with the count of duplicate elements (if X is the
  earliest element which has duplicates).  */
 
-  if (n_var == n_elts && n_elts <= 16)
+  if (n_var >= n_elts - 1 && n_elts <= 16)
 {
   int matches[16][2] = {0};
   for (int i = 0; i < n_elts; i++)
@@ -22209,12 +22209,23 @@ aarch64_expand_vector_init_fallback (rtx target, rtx 
vals)
}
   int maxelement = 0;
   int maxv = 0;
+  rtx const_elem = NULL_RTX;
+  int const_elem_pos = 0;
+
   for (int i = 0; i < n_elts; i++)
-   if (matches[i][1] > maxv)
- {
-   maxelement = i;
-   maxv = matches[i][1];
- }
+   {
+ if (matches[i][1] > maxv)
+   {
+ maxelement = i;
+ maxv = matches[i][1];
+   }
+ if (CONST_INT_P (XVECEXP (vals, 0, i))
+ || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
+   {
+ const_elem_pos = i;
+ const_elem = XVECEXP (vals, 0, i);
+   }
+   }
 
   /* Create a duplicate of the most common element, unless all elements
 are equally useless to us, in which case just immediately set the
@@ -22252,8 +22263,19 @@ aarch64_expand_vector_init_fallback (rtx target, rtx 
vals)
 vector register.  For big-endian we want that position to hold
 the last element of VALS.  */
  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
- rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
- aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
+
+ /* If we have a single constant element, use that for duplicating
+instead.  */
+ if (const_elem)
+   {
+ maxelement = const_elem_pos;
+ aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
+   }
+ else
+   {
+ rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+ aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
+   }
}
   else
{
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const-2.c 
b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const-2.c
new file mode 100644
index 000..f4dcab429c1
--- /dev/nul

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-25 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 25 May 2023 at 13:04, Richard Sandiford
 wrote:
>
> LGTM, just a couple of comment tweaks:
>
> Prathamesh Kulkarni  writes:
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index d6fc94015fa..db7ca4c28c3 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22332,6 +22332,46 @@ aarch64_unzip_vector_init (machine_mode mode, rtx 
> > vals, bool even_p)
> >return gen_rtx_PARALLEL (new_mode, vec);
> >  }
> >
> > +/* Return true if INSN is a scalar move.  */
>
> s/INSN/SET/
>
> > +
> > +static bool
> > +scalar_move_insn_p (rtx set)
> > +{
> > +  rtx src = SET_SRC (set);
> > +  rtx dest = SET_DEST (set);
> > +  return (is_a (GET_MODE (dest))
> > +   && aarch64_mov_operand (src, GET_MODE (dest)));
> > +}
> > +
> > +/* Similar to seq_cost, but ignore cost for scalar moves.  This function
> > +   is called from aarch64_expand_vector_init.  */
>
> Probably best to drop the second sentence.
>
> OK with those changes, thanks (no need to retest).
Thanks, committed as ea9154dbc8fc86d4c617503ca5e6f02fed3a6a56.

Thanks,
Prathamesh
>
> Richard


Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-25 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 25 May 2023 at 01:28, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Wed, 24 May 2023 at 15:40, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Mon, 22 May 2023 at 14:18, Richard Sandiford
> >> >  wrote:
> >> >>
> >> >> Prathamesh Kulkarni  writes:
> >> >> > Hi Richard,
> >> >> > Thanks for the suggestions. Does the attached patch look OK ?
> >> >> > Bootstrap+test in progress on aarch64-linux-gnu.
> >> >>
> >> >> Like I say, please wait for the tests to complete before sending an RFA.
> >> >> It saves a review cycle if the tests don't in fact pass.
> >> > Right, sorry, will post patches after completion of testing henceforth.
> >> >>
> >> >> > diff --git a/gcc/config/aarch64/aarch64.cc 
> >> >> > b/gcc/config/aarch64/aarch64.cc
> >> >> > index 29dbacfa917..e611a7cca25 100644
> >> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> >> > @@ -22332,6 +22332,43 @@ aarch64_unzip_vector_init (machine_mode 
> >> >> > mode, rtx vals, bool even_p)
> >> >> >return gen_rtx_PARALLEL (new_mode, vec);
> >> >> >  }
> >> >> >
> >> >> > +/* Return true if INSN is a scalar move.  */
> >> >> > +
> >> >> > +static bool
> >> >> > +scalar_move_insn_p (const rtx_insn *insn)
> >> >> > +{
> >> >> > +  rtx set = single_set (insn);
> >> >> > +  if (!set)
> >> >> > +return false;
> >> >> > +  rtx src = SET_SRC (set);
> >> >> > +  rtx dest = SET_DEST (set);
> >> >> > +  return is_a(GET_MODE (dest))
> >> >> > +  && aarch64_mov_operand_p (src, GET_MODE (src));
> >> >>
> >> >> Formatting:
> >> >>
> >> >>   return (is_a(GET_MODE (dest))
> >> >>   && aarch64_mov_operand_p (src, GET_MODE (src)));
> >> >>
> >> >> OK with that change if the tests pass, thanks.
> >> > Unfortunately, the patch regressed vec-init-21.c:
> >> >
> >> > int8x16_t f_s8(int8_t x, int8_t y)
> >> > {
> >> >   return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> >> >7, 8, 9, 10, 11, 12, 13, 14 };
> >> > }
> >> >
> >> > -O3 code-gen trunk:
> >> > f_s8:
> >> > adrpx2, .LC0
> >> > ldr q0, [x2, #:lo12:.LC0]
> >> > ins v0.b[0], w0
> >> > ins v0.b[1], w1
> >> > ret
> >> >
> >> > -O3 code-gen patch:
> >> > f_s8:
> >> > adrpx2, .LC0
> >> > ldr d31, [x2, #:lo12:.LC0]
> >> > adrpx2, .LC1
> >> > ldr d0, [x2, #:lo12:.LC1]
> >> > ins v31.b[0], w0
> >> > ins v0.b[0], w1
> >> > zip1v0.16b, v31.16b, v0.16b
> >> > ret
> >> >
> >> > With trunk, it chooses the fallback sequence because both fallback
> >> > and zip1 sequence had cost = 20, however with patch applied,
> >> > we end up with zip1 sequence cost = 24 and fallback sequence
> >> > cost = 28.
> >> >
> >> > This happens because of using insn_cost instead of
> >> > set_rtx_cost for the following expression:
> >> > (set (reg:QI 100)
> >> > (subreg/s/u:QI (reg/v:SI 94 [ y ]) 0))
> >> > set_rtx_cost returns 0 for above expression but insn_cost returns 4.
> >>
> >> Yeah, was wondering why you'd dropped the set_rtx_cost thing,
> >> but decided not to question it since using insn_cost seemed
> >> reasonable if it worked.
> > The attached patch uses set_rtx_cost for single_set and insn_cost
> > otherwise for non debug insns similar to seq_cost.
>
> FWIW, I think with the aarch64_mov_operand fix, the old way of using
> insn_cost for everything would have worked too.  But either way is fine.
>
> >> > This expression template appears twice in fallback sequence, which raises
> >> > the cost to 28 from 20, while it appears once in each half of zip1 
> >> > sequence,
> >> > which raises the cost to 24 

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-24 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 24 May 2023 at 15:40, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Mon, 22 May 2023 at 14:18, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > Hi Richard,
> >> > Thanks for the suggestions. Does the attached patch look OK ?
> >> > Bootstrap+test in progress on aarch64-linux-gnu.
> >>
> >> Like I say, please wait for the tests to complete before sending an RFA.
> >> It saves a review cycle if the tests don't in fact pass.
> > Right, sorry, will post patches after completion of testing henceforth.
> >>
> >> > diff --git a/gcc/config/aarch64/aarch64.cc 
> >> > b/gcc/config/aarch64/aarch64.cc
> >> > index 29dbacfa917..e611a7cca25 100644
> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> > @@ -22332,6 +22332,43 @@ aarch64_unzip_vector_init (machine_mode mode, 
> >> > rtx vals, bool even_p)
> >> >return gen_rtx_PARALLEL (new_mode, vec);
> >> >  }
> >> >
> >> > +/* Return true if INSN is a scalar move.  */
> >> > +
> >> > +static bool
> >> > +scalar_move_insn_p (const rtx_insn *insn)
> >> > +{
> >> > +  rtx set = single_set (insn);
> >> > +  if (!set)
> >> > +return false;
> >> > +  rtx src = SET_SRC (set);
> >> > +  rtx dest = SET_DEST (set);
> >> > +  return is_a(GET_MODE (dest))
> >> > +  && aarch64_mov_operand_p (src, GET_MODE (src));
> >>
> >> Formatting:
> >>
> >>   return (is_a(GET_MODE (dest))
> >>   && aarch64_mov_operand_p (src, GET_MODE (src)));
> >>
> >> OK with that change if the tests pass, thanks.
> > Unfortunately, the patch regressed vec-init-21.c:
> >
> > int8x16_t f_s8(int8_t x, int8_t y)
> > {
> >   return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> >7, 8, 9, 10, 11, 12, 13, 14 };
> > }
> >
> > -O3 code-gen trunk:
> > f_s8:
> > adrpx2, .LC0
> > ldr q0, [x2, #:lo12:.LC0]
> > ins v0.b[0], w0
> > ins v0.b[1], w1
> > ret
> >
> > -O3 code-gen patch:
> > f_s8:
> > adrpx2, .LC0
> > ldr d31, [x2, #:lo12:.LC0]
> > adrpx2, .LC1
> > ldr d0, [x2, #:lo12:.LC1]
> > ins v31.b[0], w0
> > ins v0.b[0], w1
> > zip1v0.16b, v31.16b, v0.16b
> > ret
> >
> > With trunk, it chooses the fallback sequence because both fallback
> > and zip1 sequence had cost = 20, however with patch applied,
> > we end up with zip1 sequence cost = 24 and fallback sequence
> > cost = 28.
> >
> > This happens because of using insn_cost instead of
> > set_rtx_cost for the following expression:
> > (set (reg:QI 100)
> > (subreg/s/u:QI (reg/v:SI 94 [ y ]) 0))
> > set_rtx_cost returns 0 for above expression but insn_cost returns 4.
>
> Yeah, was wondering why you'd dropped the set_rtx_cost thing,
> but decided not to question it since using insn_cost seemed
> reasonable if it worked.
[reposting because my reply got blocked for moderator approval]

The attached patch uses set_rtx_cost for single_set and insn_cost
otherwise for non debug insns similar to seq_cost.
>
> > This expression template appears twice in fallback sequence, which raises
> > the cost to 28 from 20, while it appears once in each half of zip1 sequence,
> > which raises the cost to 24 from 20, and so it now prefers zip1 sequence
> > instead.
> >
> > I assumed this expression would be ignored because it looks like a scalar 
> > move,
> > but that doesn't seem to be the case ?
> > aarch64_classify_symbolic_expression returns
> > SYMBOL_FORCE_TO_MEM for (subreg/s/u:QI (reg/v:SI 94 [ y ]) 0)
> > and thus aarch64_mov_operand_p returns false.
>
> Ah, I guess it should be aarch64_mov_operand instead.  Confusing that
> they're so different...
Thanks, using aarch64_mov_operand worked.
>
> > Another issue with the zip1 sequence above is using same register x2
> > for loading another half of constant in:
> > adrpx2, .LC1
> >
> > I guess this will create an output dependency from adrp x2, .LC0 ->
> > adrp x2, .LC1
> > and anti-dependency from  ldr d31, [x2, #:lo12:.LC0] -> adrp x2, .LC1
> > essentially forcing almost the entire sequence (except ins
> > instructions) to execute sequen

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-24 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 22 May 2023 at 14:18, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi Richard,
> > Thanks for the suggestions. Does the attached patch look OK ?
> > Bootstrap+test in progress on aarch64-linux-gnu.
>
> Like I say, please wait for the tests to complete before sending an RFA.
> It saves a review cycle if the tests don't in fact pass.
Right, sorry, will post patches after completion of testing henceforth.
>
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 29dbacfa917..e611a7cca25 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22332,6 +22332,43 @@ aarch64_unzip_vector_init (machine_mode mode, rtx 
> > vals, bool even_p)
> >return gen_rtx_PARALLEL (new_mode, vec);
> >  }
> >
> > +/* Return true if INSN is a scalar move.  */
> > +
> > +static bool
> > +scalar_move_insn_p (const rtx_insn *insn)
> > +{
> > +  rtx set = single_set (insn);
> > +  if (!set)
> > +return false;
> > +  rtx src = SET_SRC (set);
> > +  rtx dest = SET_DEST (set);
> > +  return is_a<scalar_mode>(GET_MODE (dest))
> > +  && aarch64_mov_operand_p (src, GET_MODE (src));
>
> Formatting:
>
>   return (is_a(GET_MODE (dest))
>   && aarch64_mov_operand_p (src, GET_MODE (src)));
>
> OK with that change if the tests pass, thanks.
Unfortunately, the patch regressed vec-init-21.c:

int8x16_t f_s8(int8_t x, int8_t y)
{
  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
   7, 8, 9, 10, 11, 12, 13, 14 };
}

-O3 code-gen trunk:
f_s8:
adrpx2, .LC0
ldr q0, [x2, #:lo12:.LC0]
ins v0.b[0], w0
ins v0.b[1], w1
ret

-O3 code-gen patch:
f_s8:
adrpx2, .LC0
ldr d31, [x2, #:lo12:.LC0]
adrpx2, .LC1
ldr d0, [x2, #:lo12:.LC1]
ins v31.b[0], w0
ins v0.b[0], w1
zip1v0.16b, v31.16b, v0.16b
ret

With trunk, it chooses the fallback sequence because both fallback
and zip1 sequence had cost = 20, however with patch applied,
we end up with zip1 sequence cost = 24 and fallback sequence
cost = 28.

This happens because of using insn_cost instead of
set_rtx_cost for the following expression:
(set (reg:QI 100)
(subreg/s/u:QI (reg/v:SI 94 [ y ]) 0))
set_rtx_cost returns 0 for above expression but insn_cost returns 4.

This expression template appears twice in fallback sequence, which raises
the cost to 28 from 20, while it appears once in each half of zip1 sequence,
which raises the cost to 24 from 20, and so it now prefers zip1 sequence
instead.

I assumed this expression would be ignored because it looks like a scalar move,
but that doesn't seem to be the case ?
aarch64_classify_symbolic_expression returns
SYMBOL_FORCE_TO_MEM for (subreg/s/u:QI (reg/v:SI 94 [ y ]) 0)
and thus aarch64_mov_operand_p returns false.

Another issue with the zip1 sequence above is using same register x2
for loading another half of constant in:
adrpx2, .LC1

I guess this will create an output dependency from adrp x2, .LC0 ->
adrp x2, .LC1
and anti-dependency from  ldr d31, [x2, #:lo12:.LC0] -> adrp x2, .LC1
essentially forcing almost the entire sequence (except ins
instructions) to execute sequentially ?

Fallback sequence rtl, cost = 28
(set (reg:V16QI 96)
(const_vector:V16QI [
(const_int 7 [0x7])
(const_int 8 [0x8])
(const_int 1 [0x1])
(const_int 2 [0x2])
(const_int 3 [0x3])
(const_int 4 [0x4])
(const_int 5 [0x5])
(const_int 6 [0x6])
(const_int 7 [0x7])
(const_int 8 [0x8])
(const_int 9 [0x9])
(const_int 10 [0xa])
(const_int 11 [0xb])
(const_int 12 [0xc])
(const_int 13 [0xd])
(const_int 14 [0xe])
]))
cost = 12
(set (reg:QI 101)
(subreg/s/u:QI (reg/v:SI 93 [ x ]) 0))
cost = 4
(set (reg:V16QI 96)
(vec_merge:V16QI (vec_duplicate:V16QI (reg:QI 101))
(reg:V16QI 96)
(const_int 1 [0x1])))
cost = 4
(set (reg:QI 102)
(subreg/s/u:QI (reg/v:SI 94 [ y ]) 0))
cost = 4
(set (reg:V16QI 96)
(vec_merge:V16QI (vec_duplicate:V16QI (reg:QI 102))
(reg:V16QI 96)
(const_int 2 [0x2])))
cost = 4

zip1 sequence rtl, cost = 24
(set (reg:V8QI 97)
(const_vector:V8QI [
(const_int 7 [0x7])
(const_int 1 [0x1])
(const_int 3 [0x3])
(const_int 5 [0x5])
(const_int 7 [0x7])
(const_int 9 [0x9])
(const_int 11 [0xb])
(const_int 13 [0xd])
]))
cost = 12
(set (reg:QI 98)
(subreg/s/u:QI (reg/v:SI 93 [ x ]) 0))
cost = 4
(set (reg:V8QI 97)
(vec_merge:V8QI (vec_duplicate:V8QI (reg:Q

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-19 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 18 May 2023 at 22:04, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Thu, 18 May 2023 at 13:37, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Tue, 16 May 2023 at 00:29, Richard Sandiford
> >> >  wrote:
> >> >>
> >> >> Prathamesh Kulkarni  writes:
> >> >> > Hi Richard,
> >> >> > After committing the interleave+zip1 patch for vector initialization,
> >> >> > it seems to regress the s32 case for this patch:
> >> >> >
> >> >> > int32x4_t f_s32(int32_t x)
> >> >> > {
> >> >> >   return (int32x4_t) { x, x, x, 1 };
> >> >> > }
> >> >> >
> >> >> > code-gen:
> >> >> > f_s32:
> >> >> > moviv30.2s, 0x1
> >> >> > fmovs31, w0
> >> >> > dup v0.2s, v31.s[0]
> >> >> > ins v30.s[0], v31.s[0]
> >> >> > zip1v0.4s, v0.4s, v30.4s
> >> >> > ret
> >> >> >
> >> >> > instead of expected code-gen:
> >> >> > f_s32:
> >> >> > moviv31.2s, 0x1
> >> >> > dup v0.4s, w0
> >> >> > ins v0.s[3], v31.s[0]
> >> >> > ret
> >> >> >
> >> >> > Cost for fallback sequence: 16
> >> >> > Cost for interleave and zip sequence: 12
> >> >> >
> >> >> > For the above case, the cost for interleave+zip1 sequence is computed 
> >> >> > as:
> >> >> > halves[0]:
> >> >> > (set (reg:V2SI 96)
> >> >> > (vec_duplicate:V2SI (reg/v:SI 93 [ x ])))
> >> >> > cost = 8
> >> >> >
> >> >> > halves[1]:
> >> >> > (set (reg:V2SI 97)
> >> >> > (const_vector:V2SI [
> >> >> > (const_int 1 [0x1]) repeated x2
> >> >> > ]))
> >> >> > (set (reg:V2SI 97)
> >> >> > (vec_merge:V2SI (vec_duplicate:V2SI (reg/v:SI 93 [ x ]))
> >> >> > (reg:V2SI 97)
> >> >> > (const_int 1 [0x1])))
> >> >> > cost = 8
> >> >> >
> >> >> > followed by:
> >> >> > (set (reg:V4SI 95)
> >> >> > (unspec:V4SI [
> >> >> > (subreg:V4SI (reg:V2SI 96) 0)
> >> >> > (subreg:V4SI (reg:V2SI 97) 0)
> >> >> > ] UNSPEC_ZIP1))
> >> >> > cost = 4
> >> >> >
> >> >> > So the total cost becomes
> >> >> > max(costs[0], costs[1]) + zip1_insn_cost
> >> >> > = max(8, 8) + 4
> >> >> > = 12
> >> >> >
> >> >> > While the fallback rtl sequence is:
> >> >> > (set (reg:V4SI 95)
> >> >> > (vec_duplicate:V4SI (reg/v:SI 93 [ x ])))
> >> >> > cost = 8
> >> >> > (set (reg:SI 98)
> >> >> > (const_int 1 [0x1]))
> >> >> > cost = 4
> >> >> > (set (reg:V4SI 95)
> >> >> > (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 98))
> >> >> > (reg:V4SI 95)
> >> >> > (const_int 8 [0x8])))
> >> >> > cost = 4
> >> >> >
> >> >> > So total cost = 8 + 4 + 4 = 16, and we choose the interleave+zip1 
> >> >> > sequence.
> >> >> >
> >> >> > I think the issue is probably that for the interleave+zip1 sequence 
> >> >> > we take
> >> >> > max(costs[0], costs[1]) to reflect that both halves are interleaved,
> >> >> > but for the fallback seq we use seq_cost, which assumes serial 
> >> >> > execution
> >> >> > of insns in the sequence.
> >> >> > For above fallback sequence,
> >> >> > set (reg:V4SI 95)
> >> >> > (vec_duplicate:V4SI (reg/v:SI 93 [ x ])))
> >> >> > and
> >> >> > (set (reg:SI 98)
> >> >> > (const_int 1 [0x1]))
> >> >> > could be executed in parallel, which would make it's cost max(8, 4) + 
> >> >> > 4 = 12.
> >&

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-18 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 18 May 2023 at 13:37, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Tue, 16 May 2023 at 00:29, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > Hi Richard,
> >> > After committing the interleave+zip1 patch for vector initialization,
> >> > it seems to regress the s32 case for this patch:
> >> >
> >> > int32x4_t f_s32(int32_t x)
> >> > {
> >> >   return (int32x4_t) { x, x, x, 1 };
> >> > }
> >> >
> >> > code-gen:
> >> > f_s32:
> >> > moviv30.2s, 0x1
> >> > fmovs31, w0
> >> > dup v0.2s, v31.s[0]
> >> > ins v30.s[0], v31.s[0]
> >> > zip1v0.4s, v0.4s, v30.4s
> >> > ret
> >> >
> >> > instead of expected code-gen:
> >> > f_s32:
> >> > moviv31.2s, 0x1
> >> > dup v0.4s, w0
> >> > ins v0.s[3], v31.s[0]
> >> > ret
> >> >
> >> > Cost for fallback sequence: 16
> >> > Cost for interleave and zip sequence: 12
> >> >
> >> > For the above case, the cost for interleave+zip1 sequence is computed as:
> >> > halves[0]:
> >> > (set (reg:V2SI 96)
> >> > (vec_duplicate:V2SI (reg/v:SI 93 [ x ])))
> >> > cost = 8
> >> >
> >> > halves[1]:
> >> > (set (reg:V2SI 97)
> >> > (const_vector:V2SI [
> >> > (const_int 1 [0x1]) repeated x2
> >> > ]))
> >> > (set (reg:V2SI 97)
> >> > (vec_merge:V2SI (vec_duplicate:V2SI (reg/v:SI 93 [ x ]))
> >> > (reg:V2SI 97)
> >> > (const_int 1 [0x1])))
> >> > cost = 8
> >> >
> >> > followed by:
> >> > (set (reg:V4SI 95)
> >> > (unspec:V4SI [
> >> > (subreg:V4SI (reg:V2SI 96) 0)
> >> > (subreg:V4SI (reg:V2SI 97) 0)
> >> > ] UNSPEC_ZIP1))
> >> > cost = 4
> >> >
> >> > So the total cost becomes
> >> > max(costs[0], costs[1]) + zip1_insn_cost
> >> > = max(8, 8) + 4
> >> > = 12
> >> >
> >> > While the fallback rtl sequence is:
> >> > (set (reg:V4SI 95)
> >> > (vec_duplicate:V4SI (reg/v:SI 93 [ x ])))
> >> > cost = 8
> >> > (set (reg:SI 98)
> >> > (const_int 1 [0x1]))
> >> > cost = 4
> >> > (set (reg:V4SI 95)
> >> > (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 98))
> >> > (reg:V4SI 95)
> >> > (const_int 8 [0x8])))
> >> > cost = 4
> >> >
> >> > So total cost = 8 + 4 + 4 = 16, and we choose the interleave+zip1 
> >> > sequence.
> >> >
> >> > I think the issue is probably that for the interleave+zip1 sequence we 
> >> > take
> >> > max(costs[0], costs[1]) to reflect that both halves are interleaved,
> >> > but for the fallback seq we use seq_cost, which assumes serial execution
> >> > of insns in the sequence.
> >> > For above fallback sequence,
> >> > set (reg:V4SI 95)
> >> > (vec_duplicate:V4SI (reg/v:SI 93 [ x ])))
> >> > and
> >> > (set (reg:SI 98)
> >> > (const_int 1 [0x1]))
> >> > could be executed in parallel, which would make it's cost max(8, 4) + 4 
> >> > = 12.
> >>
> >> Agreed.
> >>
> >> A good-enough substitute for this might be to ignore scalar moves
> >> (for both alternatives) when costing for speed.
> > Thanks for the suggestions. Just wondering for aarch64, if there's an easy
> > way we can check if insn is a scalar move, similar to riscv's 
> > scalar_move_insn_p
> > that checks if get_attr_type(insn) is TYPE_VIMOVXV or TYPE_VFMOVFV ?
>
> It should be enough to check that the pattern is a SET:
>
> (a) whose SET_DEST has a scalar mode and
> (b) whose SET_SRC an aarch64_mov_operand
Hi Richard,
Thanks for the suggestions, the attached patch calls seq_cost to compute
cost for sequence and then subtracts cost of each scalar move insn from it.
Does that look OK ?
The patch is under bootstrap+test on aarch64-linux-gnu.

After applying the single-constant case patch on top, the cost of fallback
sequence is now reduced to 12 instead of 16:
Cost before ignoring scalar moves: 16
I

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-17 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 16 May 2023 at 00:29, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi Richard,
> > After committing the interleave+zip1 patch for vector initialization,
> > it seems to regress the s32 case for this patch:
> >
> > int32x4_t f_s32(int32_t x)
> > {
> >   return (int32x4_t) { x, x, x, 1 };
> > }
> >
> > code-gen:
> > f_s32:
> > moviv30.2s, 0x1
> > fmovs31, w0
> > dup v0.2s, v31.s[0]
> > ins v30.s[0], v31.s[0]
> > zip1v0.4s, v0.4s, v30.4s
> > ret
> >
> > instead of expected code-gen:
> > f_s32:
> > moviv31.2s, 0x1
> > dup v0.4s, w0
> > ins v0.s[3], v31.s[0]
> > ret
> >
> > Cost for fallback sequence: 16
> > Cost for interleave and zip sequence: 12
> >
> > For the above case, the cost for interleave+zip1 sequence is computed as:
> > halves[0]:
> > (set (reg:V2SI 96)
> > (vec_duplicate:V2SI (reg/v:SI 93 [ x ])))
> > cost = 8
> >
> > halves[1]:
> > (set (reg:V2SI 97)
> > (const_vector:V2SI [
> > (const_int 1 [0x1]) repeated x2
> > ]))
> > (set (reg:V2SI 97)
> > (vec_merge:V2SI (vec_duplicate:V2SI (reg/v:SI 93 [ x ]))
> > (reg:V2SI 97)
> > (const_int 1 [0x1])))
> > cost = 8
> >
> > followed by:
> > (set (reg:V4SI 95)
> > (unspec:V4SI [
> > (subreg:V4SI (reg:V2SI 96) 0)
> > (subreg:V4SI (reg:V2SI 97) 0)
> > ] UNSPEC_ZIP1))
> > cost = 4
> >
> > So the total cost becomes
> > max(costs[0], costs[1]) + zip1_insn_cost
> > = max(8, 8) + 4
> > = 12
> >
> > While the fallback rtl sequence is:
> > (set (reg:V4SI 95)
> > (vec_duplicate:V4SI (reg/v:SI 93 [ x ])))
> > cost = 8
> > (set (reg:SI 98)
> > (const_int 1 [0x1]))
> > cost = 4
> > (set (reg:V4SI 95)
> > (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 98))
> > (reg:V4SI 95)
> > (const_int 8 [0x8])))
> > cost = 4
> >
> > So total cost = 8 + 4 + 4 = 16, and we choose the interleave+zip1 sequence.
> >
> > I think the issue is probably that for the interleave+zip1 sequence we take
> > max(costs[0], costs[1]) to reflect that both halves are interleaved,
> > but for the fallback seq we use seq_cost, which assumes serial execution
> > of insns in the sequence.
> > For above fallback sequence,
> > set (reg:V4SI 95)
> > (vec_duplicate:V4SI (reg/v:SI 93 [ x ])))
> > and
> > (set (reg:SI 98)
> > (const_int 1 [0x1]))
> > could be executed in parallel, which would make its cost max(8, 4) + 4 = 
> > 12.
>
> Agreed.
>
> A good-enough substitute for this might be to ignore scalar moves
> (for both alternatives) when costing for speed.
Thanks for the suggestions. Just wondering for aarch64, if there's an easy
way we can check if insn is a scalar move, similar to riscv's scalar_move_insn_p
that checks if get_attr_type(insn) is TYPE_VIMOVXV or TYPE_VFMOVFV ?
>
> > I was wondering if we should we make cost for interleave+zip1 sequence
> > more conservative
> > by not taking max, but summing up costs[0] + costs[1] even for speed ?
> > For this case,
> > that would be 8 + 8 + 4 = 20.
> >
> > It generates the fallback sequence for other cases (s8, s16, s64) from
> > the test-case.
>
> What does it do for the tests in the interleave+zip1 patch?  If it doesn't
> make a difference there then it sounds like we don't have enough tests. :)
Oh right, the tests in interleave+zip1 patch only check for s16 case,
sorry about that :/
Looking briefly at the code generated for s8, s32 and s64 case,
(a) s8, and s16 seem to use same sequence for all cases.
(b) s64 seems to use fallback sequence.
(c) For vec-init-21.c, s8 and s16 cases prefer fallback sequence
because costs are tied,
while s32 case prefers interleave+zip1:

int32x4_t f_s32(int32_t x, int32_t y)
{
  return (int32x4_t) { x, y, 1, 2 };
}

Code-gen with interleave+zip1 sequence:
f_s32:
        movi    v31.2s, 0x1
        movi    v0.2s, 0x2
        ins     v31.s[0], w0
        ins     v0.s[0], w1
        zip1    v0.4s, v31.4s, v0.4s
        ret

Code-gen with fallback sequence:
f_s32:
        adrp    x2, .LC0
        ldr     q0, [x2, #:lo12:.LC0]
        ins     v0.s[0], w0
        ins     v0.s[1], w1
        ret

Fallback sequence cost = 20
interleave+zip1 sequence cost = 12
I assume interleave+zip1 sequence is better in this case (chosen currently) ?

I will send a patch to add cases for s8, s16 and s64 in a foll

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-15 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 12 May 2023 at 00:45, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
>
> > On Tue, 2 May 2023 at 18:22, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Tue, 2 May 2023 at 17:32, Richard Sandiford
> >> >  wrote:
> >> >>
> >> >> Prathamesh Kulkarni  writes:
> >> >> > On Tue, 2 May 2023 at 14:56, Richard Sandiford
> >> >> >  wrote:
> >> >> >> > [aarch64] Improve code-gen for vector initialization with single 
> >> >> >> > constant element.
> >> >> >> >
> >> >> >> > gcc/ChangeLog:
> >> >> >> >   * config/aarch64/aarc64.cc (aarch64_expand_vector_init): 
> >> >> >> > Tweak condition
> >> >> >> >   if (n_var == n_elts && n_elts <= 16) to allow a single 
> >> >> >> > constant,
> >> >> >> >   and if maxv == 1, use constant element for duplicating into 
> >> >> >> > register.
> >> >> >> >
> >> >> >> > gcc/testsuite/ChangeLog:
> >> >> >> >   * gcc.target/aarch64/vec-init-single-const.c: New test.
> >> >> >> >
> >> >> >> > diff --git a/gcc/config/aarch64/aarch64.cc 
> >> >> >> > b/gcc/config/aarch64/aarch64.cc
> >> >> >> > index 2b0de7ca038..f46750133a6 100644
> >> >> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> >> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> >> >> > @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, 
> >> >> >> > rtx vals)
> >> >> >> >   and matches[X][1] with the count of duplicate elements (if X 
> >> >> >> > is the
> >> >> >> >   earliest element which has duplicates).  */
> >> >> >> >
> >> >> >> > -  if (n_var == n_elts && n_elts <= 16)
> >> >> >> > +  if ((n_var >= n_elts - 1) && n_elts <= 16)
> >> >> >> >  {
> >> >> >> >int matches[16][2] = {0};
> >> >> >> >for (int i = 0; i < n_elts; i++)
> >> >> >> > @@ -7,6 +7,18 @@ aarch64_expand_vector_init (rtx target, 
> >> >> >> > rtx vals)
> >> >> >> >vector register.  For big-endian we want that position 
> >> >> >> > to hold
> >> >> >> >the last element of VALS.  */
> >> >> >> > maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
> >> >> >> > +
> >> >> >> > +   /* If we have a single constant element, use that for 
> >> >> >> > duplicating
> >> >> >> > +  instead.  */
> >> >> >> > +   if (n_var == n_elts - 1)
> >> >> >> > + for (int i = 0; i < n_elts; i++)
> >> >> >> > +   if (CONST_INT_P (XVECEXP (vals, 0, i))
> >> >> >> > +   || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
> >> >> >> > + {
> >> >> >> > +   maxelement = i;
> >> >> >> > +   break;
> >> >> >> > + }
> >> >> >> > +
> >> >> >> > rtx x = force_reg (inner_mode, XVECEXP (vals, 0, 
> >> >> >> > maxelement));
> >> >> >> > aarch64_emit_move (target, lowpart_subreg (mode, x, 
> >> >> >> > inner_mode));
> >> >> >>
> >> >> >> We don't want to force the constant into a register though.
> >> >> > OK right, sorry.
> >> >> > With the attached patch, for the following test-case:
> >> >> > int64x2_t f_s64(int64_t x)
> >> >> > {
> >> >> >   return (int64x2_t) { x, 1 };
> >> >> > }
> >> >> >
> >> >> > it loads constant from memory (same code-gen as without patch).
> >> >> > f_s64:
> >> >> > adrpx1, .LC0
> >> >> > ldr q0, [x1, #:lo12:.LC0]
> >> >> > ins v0

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-05-13 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 12 May 2023 at 00:37, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c 
> > b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > new file mode 100644
> > index 000..598a51f17c6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > @@ -0,0 +1,20 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include 
> > +
> > +int16x8_t foo(int16_t x, int y)
> > +{
> > +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> > +  return v;
> > +}
> > +
> > +int16x8_t foo2(int16_t x)
> > +{
> > +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> > +  return v;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
> > +/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
> > +/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, 
> > v[0-9]+\.8h} } } */
>
> Would be good to make this a scan-assembler-times ... 2.
>
> OK with that change.  Thanks for doing this.
Thanks, committed the patch in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8b18714fbb1ca9812b33b3de75fe6ba4a57d4946
after bootstrap+test on aarch64-linux-gnu, and verifying bootstrap
passes on aarch64-linux-gnu with --enable-checking=all.

Thanks,
Prathamesh
>
> Richard


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-05-04 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 24 Apr 2023 at 15:00, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > [aarch64] Recursively initialize even and odd sub-parts and merge with zip1.
> >
> > gcc/ChangeLog:
> >   * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): 
> > Rename
> >   aarch64_expand_vector_init to this, and remove  interleaving case.
> >   Recursively call aarch64_expand_vector_init_fallback, instead of
> >   aarch64_expand_vector_init.
> >   (aarch64_unzip_vector_init): New function.
> >   (aarch64_expand_vector_init): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new
> >   code-gen.
> >   * gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise.
> >   * gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise.
> >   * gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to
> >   this.
> >   * gcc.target/aarch64/vec-init-19.c: New test.
> >   * gcc.target/aarch64/vec-init-20.c: Likewise.
> >   * gcc.target/aarch64/vec-init-21.c: Likewise.
> >   * gcc.target/aarch64/vec-init-22-size.c: Likewise.
> >   * gcc.target/aarch64/vec-init-22-speed.c: Likewise.
> >   * gcc.target/aarch64/vec-init-22.h: New header.
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index d7e895f8d34..416e062829c 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22026,11 +22026,12 @@ aarch64_simd_make_constant (rtx vals)
> >  return NULL_RTX;
> >  }
> >
> > -/* Expand a vector initialisation sequence, such that TARGET is
> > -   initialised to contain VALS.  */
> > +/* A subroutine of aarch64_expand_vector_init, with the same interface.
> > +   The caller has already tried a divide-and-conquer approach, so do
> > +   not consider that case here.  */
> >
> >  void
> > -aarch64_expand_vector_init (rtx target, rtx vals)
> > +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
> >  {
> >machine_mode mode = GET_MODE (target);
> >scalar_mode inner_mode = GET_MODE_INNER (mode);
> > @@ -22090,38 +22091,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >return;
> >  }
> >
> > -  /* Check for interleaving case.
> > - For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > - Generate following code:
> > - dup v0.h, x
> > - dup v1.h, y
> > - zip1 v0.h, v0.h, v1.h
> > - for "large enough" initializer.  */
> > -
> > -  if (n_elts >= 8)
> > -{
> > -  int i;
> > -  for (i = 2; i < n_elts; i++)
> > - if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > -   break;
> > -
> > -  if (i == n_elts)
> > - {
> > -   machine_mode mode = GET_MODE (target);
> > -   rtx dest[2];
> > -
> > -   for (int i = 0; i < 2; i++)
> > - {
> > -   rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > -   dest[i] = force_reg (mode, x);
> > - }
> > -
> > -   rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > -   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > -   return;
> > - }
> > -}
> > -
> >enum insn_code icode = optab_handler (vec_set_optab, mode);
> >gcc_assert (icode != CODE_FOR_nothing);
> >
> > @@ -22243,7 +22212,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >   }
> > XVECEXP (copy, 0, i) = subst;
> >   }
> > -  aarch64_expand_vector_init (target, copy);
> > +  aarch64_expand_vector_init_fallback (target, copy);
> >  }
> >
> >/* Insert the variable lanes directly.  */
> > @@ -22257,6 +6,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >  }
> >  }
> >
> > +/* Return even or odd half of VALS depending on EVEN_P.  */
> > +
> > +static rtx
> > +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
> > +{
> > +  int n = XVECLEN (vals, 0);
> > +  machine_mode new_mode
> > += aarch64_simd_container_mode (GET_MODE_INNER (mode),
> > +GET_MODE_BITSIZE (mode).to_constant () / 
> > 2);
> > +  rtvec vec = rtvec_alloc (n / 2);
> > +  for (int i = 0; i < n/2; i++)
>
> Formatting nit: n / 2
>
> > +RTVEC_ELT (vec

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-03 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 2 May 2023 at 18:22, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Tue, 2 May 2023 at 17:32, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Tue, 2 May 2023 at 14:56, Richard Sandiford
> >> >  wrote:
> >> >> > [aarch64] Improve code-gen for vector initialization with single 
> >> >> > constant element.
> >> >> >
> >> >> > gcc/ChangeLog:
> >> >> >   * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak 
> >> >> > condition
> >> >> >   if (n_var == n_elts && n_elts <= 16) to allow a single constant,
> >> >> >   and if maxv == 1, use constant element for duplicating into 
> >> >> > register.
> >> >> >
> >> >> > gcc/testsuite/ChangeLog:
> >> >> >   * gcc.target/aarch64/vec-init-single-const.c: New test.
> >> >> >
> >> >> > diff --git a/gcc/config/aarch64/aarch64.cc 
> >> >> > b/gcc/config/aarch64/aarch64.cc
> >> >> > index 2b0de7ca038..f46750133a6 100644
> >> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> >> > @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx 
> >> >> > vals)
> >> >> >   and matches[X][1] with the count of duplicate elements (if X is 
> >> >> > the
> >> >> >   earliest element which has duplicates).  */
> >> >> >
> >> >> > -  if (n_var == n_elts && n_elts <= 16)
> >> >> > +  if ((n_var >= n_elts - 1) && n_elts <= 16)
> >> >> >  {
> >> >> >int matches[16][2] = {0};
> >> >> >for (int i = 0; i < n_elts; i++)
> >> >> > @@ -7,6 +7,18 @@ aarch64_expand_vector_init (rtx target, rtx 
> >> >> > vals)
> >> >> >vector register.  For big-endian we want that position to 
> >> >> > hold
> >> >> >the last element of VALS.  */
> >> >> > maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
> >> >> > +
> >> >> > +   /* If we have a single constant element, use that for 
> >> >> > duplicating
> >> >> > +  instead.  */
> >> >> > +   if (n_var == n_elts - 1)
> >> >> > + for (int i = 0; i < n_elts; i++)
> >> >> > +   if (CONST_INT_P (XVECEXP (vals, 0, i))
> >> >> > +   || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
> >> >> > + {
> >> >> > +   maxelement = i;
> >> >> > +   break;
> >> >> > + }
> >> >> > +
> >> >> > rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> >> >> > aarch64_emit_move (target, lowpart_subreg (mode, x, 
> >> >> > inner_mode));
> >> >>
> >> >> We don't want to force the constant into a register though.
> >> > OK right, sorry.
> >> > With the attached patch, for the following test-case:
> >> > int64x2_t f_s64(int64_t x)
> >> > {
> >> >   return (int64x2_t) { x, 1 };
> >> > }
> >> >
> >> > it loads constant from memory (same code-gen as without patch).
> >> > f_s64:
> >> > adrpx1, .LC0
> >> > ldr q0, [x1, #:lo12:.LC0]
> >> > ins v0.d[0], x0
> >> > ret
> >> >
> >> > Does the patch look OK ?
> >> >
> >> > Thanks,
> >> > Prathamesh
> >> > [...]
> >> > [aarch64] Improve code-gen for vector initialization with single 
> >> > constant element.
> >> >
> >> > gcc/ChangeLog:
> >> >   * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak 
> >> > condition
> >> >   if (n_var == n_elts && n_elts <= 16) to allow a single constant,
> >> >   and if maxv == 1, use constant element for duplicating into 
> >> > register.
> >> >
> >> > gcc/testsuite/ChangeLog:
> >> >   * gcc.target/aarch64/vec-init-single-c

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-02 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 2 May 2023 at 17:32, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Tue, 2 May 2023 at 14:56, Richard Sandiford
> >  wrote:
> >> > [aarch64] Improve code-gen for vector initialization with single 
> >> > constant element.
> >> >
> >> > gcc/ChangeLog:
> >> >   * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak 
> >> > condition
> >> >   if (n_var == n_elts && n_elts <= 16) to allow a single constant,
> >> >   and if maxv == 1, use constant element for duplicating into 
> >> > register.
> >> >
> >> > gcc/testsuite/ChangeLog:
> >> >   * gcc.target/aarch64/vec-init-single-const.c: New test.
> >> >
> >> > diff --git a/gcc/config/aarch64/aarch64.cc 
> >> > b/gcc/config/aarch64/aarch64.cc
> >> > index 2b0de7ca038..f46750133a6 100644
> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> > @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >> >   and matches[X][1] with the count of duplicate elements (if X is the
> >> >   earliest element which has duplicates).  */
> >> >
> >> > -  if (n_var == n_elts && n_elts <= 16)
> >> > +  if ((n_var >= n_elts - 1) && n_elts <= 16)
> >> >  {
> >> >int matches[16][2] = {0};
> >> >for (int i = 0; i < n_elts; i++)
> >> > @@ -7,6 +7,18 @@ aarch64_expand_vector_init (rtx target, rtx 
> >> > vals)
> >> >vector register.  For big-endian we want that position to hold
> >> >the last element of VALS.  */
> >> > maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
> >> > +
> >> > +   /* If we have a single constant element, use that for duplicating
> >> > +  instead.  */
> >> > +   if (n_var == n_elts - 1)
> >> > + for (int i = 0; i < n_elts; i++)
> >> > +   if (CONST_INT_P (XVECEXP (vals, 0, i))
> >> > +   || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
> >> > + {
> >> > +   maxelement = i;
> >> > +   break;
> >> > + }
> >> > +
> >> > rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> >> > aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
> >>
> >> We don't want to force the constant into a register though.
> > OK right, sorry.
> > With the attached patch, for the following test-case:
> > int64x2_t f_s64(int64_t x)
> > {
> >   return (int64x2_t) { x, 1 };
> > }
> >
> > it loads constant from memory (same code-gen as without patch).
> > f_s64:
> > adrpx1, .LC0
> > ldr q0, [x1, #:lo12:.LC0]
> > ins v0.d[0], x0
> > ret
> >
> > Does the patch look OK ?
> >
> > Thanks,
> > Prathamesh
> > [...]
> > [aarch64] Improve code-gen for vector initialization with single constant 
> > element.
> >
> > gcc/ChangeLog:
> >   * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak 
> > condition
> >   if (n_var == n_elts && n_elts <= 16) to allow a single constant,
> >   and if maxv == 1, use constant element for duplicating into register.
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/vec-init-single-const.c: New test.
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 2b0de7ca038..97309ddec4f 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >   and matches[X][1] with the count of duplicate elements (if X is the
> >   earliest element which has duplicates).  */
> >
> > -  if (n_var == n_elts && n_elts <= 16)
> > +  if ((n_var >= n_elts - 1) && n_elts <= 16)
>
> No need for the extra brackets.
Adjusted, thanks. Sorry if this sounds like a silly question, but why
do we need the n_elts <= 16 check ?
Won't n_elts be always <= 16 since max number of elements in a vector
would be 16 for V16QI ?
>
> >  {
> >int matches[16][2] = {0};
> >for (int i = 0; i < n_elts; i++)
> >

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-02 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 2 May 2023 at 14:56, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Tue, 25 Apr 2023 at 16:29, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > Hi Richard,
> >> > While digging thru aarch64_expand_vector_init, I noticed it gives
> >> > priority to loading a constant first:
> >> >  /* Initialise a vector which is part-variable.  We want to first try
> >> >  to build those lanes which are constant in the most efficient way we
> >> >  can.  */
> >> >
> >> > which results in suboptimal code-gen for following case:
> >> > int16x8_t f_s16(int16_t x)
> >> > {
> >> >   return (int16x8_t) { x, x, x, x, x, x, x, 1 };
> >> > }
> >> >
> >> > code-gen trunk:
> >> > f_s16:
> >> > moviv0.8h, 0x1
> >> > ins v0.h[0], w0
> >> > ins v0.h[1], w0
> >> > ins v0.h[2], w0
> >> > ins v0.h[3], w0
> >> > ins v0.h[4], w0
> >> > ins v0.h[5], w0
> >> > ins v0.h[6], w0
> >> > ret
> >> >
> >> > The attached patch tweaks the following condition:
> >> > if (n_var == n_elts && n_elts <= 16)
> >> >   {
> >> > ...
> >> >   }
> >> >
> >> > to pass if maxv >= 80% of n_elts, with 80% being an
> >> > arbitrary "high enough" threshold. The intent is to dup
> >> > the most repeating variable if it it's repetition
> >> > is "high enough" and insert constants which should be "better" than
> >> > loading constant first and inserting variables like in the above case.
> >>
> >> I'm not too keen on the 80%.  Like you say, it seems a bit arbitrary.
> >>
> >> The case above can also be handled by relaxing n_var == n_elts to
> >> n_var >= n_elts - 1, so that if there's just one constant element,
> >> we look for duplicated variable elements.  If there are none
> >> (maxv == 1), but there is a constant element, we can duplicate
> >> the constant element into a register.
> >>
> >> The case when there's more than one constant element needs more thought
> >> (and testcases :-)).  E.g. after a certain point, it would probably be
> >> better to load the variable and constant parts separately and blend them
> >> using TBL.  It also matters whether the constants are equal or not.
> >>
> >> There are also cases that could be handled using EXT.
> >>
> >> Plus, if we're inserting many variable elements that are already
> >> in GPRs, we can probably do better by coalescing them into bigger
> >> GPR values and inserting them as wider elements.
> >>
> >> Because of things like that, I think we should stick to the
> >> single-constant case for now.
> > Hi Richard,
> > Thanks for the suggestions. The attached patch only handles the single
> > constant case.
> > Bootstrap+test in progress on aarch64-linux-gnu.
> > Does it look OK ?
> >
> > Thanks,
> > Prathamesh
> >>
> >> Thanks,
> >> Richard
> >
> > [aarch64] Improve code-gen for vector initialization with single constant 
> > element.
> >
> > gcc/ChangeLog:
> >   * config/aarch64/aarc64.cc (aarch64_expand_vector_init): Tweak 
> > condition
> >   if (n_var == n_elts && n_elts <= 16) to allow a single constant,
> >   and if maxv == 1, use constant element for duplicating into register.
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/vec-init-single-const.c: New test.
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 2b0de7ca038..f46750133a6 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >   and matches[X][1] with the count of duplicate elements (if X is the
> >   earliest element which has duplicates).  */
> >
> > -  if (n_var == n_elts && n_elts <= 16)
> > +  if ((n_var >= n_elts - 1) && n_elts <= 16)
> >  {
> >int matches[16][2] = {0};
> >for (int i = 0; i < n_elts; i++)
> > @@ -7,6 +7,18 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >  

Re: [aarch64] Code-gen for vector initialization involving constants

2023-05-01 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 25 Apr 2023 at 16:29, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi Richard,
> > While digging thru aarch64_expand_vector_init, I noticed it gives
> > priority to loading a constant first:
> >  /* Initialise a vector which is part-variable.  We want to first try
> >  to build those lanes which are constant in the most efficient way we
> >  can.  */
> >
> > which results in suboptimal code-gen for following case:
> > int16x8_t f_s16(int16_t x)
> > {
> >   return (int16x8_t) { x, x, x, x, x, x, x, 1 };
> > }
> >
> > code-gen trunk:
> > f_s16:
> > moviv0.8h, 0x1
> > ins v0.h[0], w0
> > ins v0.h[1], w0
> > ins v0.h[2], w0
> > ins v0.h[3], w0
> > ins v0.h[4], w0
> > ins v0.h[5], w0
> > ins v0.h[6], w0
> > ret
> >
> > The attached patch tweaks the following condition:
> > if (n_var == n_elts && n_elts <= 16)
> >   {
> > ...
> >   }
> >
> > to pass if maxv >= 80% of n_elts, with 80% being an
> > arbitrary "high enough" threshold. The intent is to dup
> > the most repeating variable if its repetition
> > is "high enough" and insert constants which should be "better" than
> > loading constant first and inserting variables like in the above case.
>
> I'm not too keen on the 80%.  Like you say, it seems a bit arbitrary.
>
> The case above can also be handled by relaxing n_var == n_elts to
> n_var >= n_elts - 1, so that if there's just one constant element,
> we look for duplicated variable elements.  If there are none
> (maxv == 1), but there is a constant element, we can duplicate
> the constant element into a register.
>
> The case when there's more than one constant element needs more thought
> (and testcases :-)).  E.g. after a certain point, it would probably be
> better to load the variable and constant parts separately and blend them
> using TBL.  It also matters whether the constants are equal or not.
>
> There are also cases that could be handled using EXT.
>
> Plus, if we're inserting many variable elements that are already
> in GPRs, we can probably do better by coalescing them into bigger
> GPR values and inserting them as wider elements.
>
> Because of things like that, I think we should stick to the
> single-constant case for now.
Hi Richard,
Thanks for the suggestions. The attached patch only handles the single
constant case.
Bootstrap+test in progress on aarch64-linux-gnu.
Does it look OK ?

Thanks,
Prathamesh
>
> Thanks,
> Richard
[aarch64] Improve code-gen for vector initialization with single constant 
element.

gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_expand_vector_init): Tweak condition
if (n_var == n_elts && n_elts <= 16) to allow a single constant,
and if maxv == 1, use constant element for duplicating into register.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vec-init-single-const.c: New test.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2b0de7ca038..f46750133a6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22167,7 +22167,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
  and matches[X][1] with the count of duplicate elements (if X is the
  earliest element which has duplicates).  */
 
-  if (n_var == n_elts && n_elts <= 16)
+  if ((n_var >= n_elts - 1) && n_elts <= 16)
 {
   int matches[16][2] = {0};
   for (int i = 0; i < n_elts; i++)
@@ -7,6 +7,18 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 vector register.  For big-endian we want that position to hold
 the last element of VALS.  */
  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
+
+ /* If we have a single constant element, use that for duplicating
+instead.  */
+ if (n_var == n_elts - 1)
+   for (int i = 0; i < n_elts; i++)
+ if (CONST_INT_P (XVECEXP (vals, 0, i))
+ || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
+   {
+ maxelement = i;
+ break;
+   }
+
  rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c 
b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
new file mode 100644
index 000..517f47b13ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* {

Re: [match.pd] [SVE] Add pattern to transform svrev(svrev(v)) --> v

2023-04-24 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 24 Apr 2023 at 15:02, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > gcc/ChangeLog:
> >   * tree-ssa-forwprop.cc (is_combined_permutation_identity): Try to
> >   simplify two successive VEC_PERM_EXPRs with single operand and same
> >   mask, where mask chooses elements in reverse order.
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/sve/acle/general/rev-1.c: New test.
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/rev-1.c 
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/rev-1.c
> > new file mode 100644
> > index 000..e57ee67d716
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/rev-1.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> > +
> > +#include 
> > +
> > +svint32_t f(svint32_t v)
> > +{
> > +  return svrev_s32 (svrev_s32 (v));
> > +}
> > +
> > +/* { dg-final { scan-tree-dump "return v_1\\(D\\)" "optimized" } } */
> > +/* { dg-final { scan-tree-dump-not "VEC_PERM_EXPR" "optimized" } } */
> > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> > index 9b567440ba4..61df7efe82c 100644
> > --- a/gcc/tree-ssa-forwprop.cc
> > +++ b/gcc/tree-ssa-forwprop.cc
> > @@ -2541,6 +2541,27 @@ is_combined_permutation_identity (tree mask1, tree 
> > mask2)
> >
> >gcc_checking_assert (TREE_CODE (mask1) == VECTOR_CST
> >  && TREE_CODE (mask2) == VECTOR_CST);
> > +
> > +  /* For VLA masks, check for the following pattern:
> > + v1 = VEC_PERM_EXPR (v0, v0, mask1)
> > + v2 = VEC_PERM_EXPR (v1, v1, mask2)
>
> Maybe blank out the second operands using "...":
>
>  v1 = VEC_PERM_EXPR (v0, ..., mask1)
>  v2 = VEC_PERM_EXPR (v1, ..., mask2)
>
> to make it clear that they don't matter.
>
> OK with that change, thanks.
Thanks, committed in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=f0eabc52c9a2d3da0bfc201da7a5c1658b76e9a4

Thanks,
Prathamesh
>
> Richard
>
> > + -->
> > + v2 = v0
> > + if mask1 == mask2 == {nelts - 1, nelts - 2, ...}.  */
> > +
> > +  if (operand_equal_p (mask1, mask2, 0)
> > +  && !VECTOR_CST_NELTS (mask1).is_constant ())
> > +{
> > +  vec_perm_builder builder;
> > +  if (tree_to_vec_perm_builder (&builder, mask1))
> > + {
> > +   poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask1));
> > +   vec_perm_indices sel (builder, 1, nelts);
> > +   if (sel.series_p (0, 1, nelts - 1, -1))
> > + return 1;
> > + }
> > +}
> > +
> >mask = fold_ternary (VEC_PERM_EXPR, TREE_TYPE (mask1), mask1, mask1, 
> > mask2);
> >if (mask == NULL_TREE || TREE_CODE (mask) != VECTOR_CST)
> >  return 0;


Re: [match.pd] [SVE] Add pattern to transform svrev(svrev(v)) --> v

2023-04-23 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 21 Apr 2023 at 21:57, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Wed, 19 Apr 2023 at 16:17, Richard Biener  
> > wrote:
> >>
> >> On Wed, Apr 19, 2023 at 11:21 AM Prathamesh Kulkarni
> >>  wrote:
> >> >
> >> > On Tue, 11 Apr 2023 at 19:36, Prathamesh Kulkarni
> >> >  wrote:
> >> > >
> >> > > On Tue, 11 Apr 2023 at 14:17, Richard Biener 
> >> > >  wrote:
> >> > > >
> >> > > > On Wed, Apr 5, 2023 at 10:39 AM Prathamesh Kulkarni via Gcc-patches
> >> > > >  wrote:
> >> > > > >
> >> > > > > Hi,
> >> > > > > For the following test:
> >> > > > >
> >> > > > > svint32_t f(svint32_t v)
> >> > > > > {
> >> > > > >   return svrev_s32 (svrev_s32 (v));
> >> > > > > }
> >> > > > >
> >> > > > > We generate 2 rev instructions instead of nop:
> >> > > > > f:
> >> > > > > rev z0.s, z0.s
> >> > > > > rev z0.s, z0.s
> >> > > > > ret
> >> > > > >
> >> > > > > The attached patch tries to fix that by trying to recognize the 
> >> > > > > following
> >> > > > > pattern in match.pd:
> >> > > > > v1 = VEC_PERM_EXPR (v0, v0, mask)
> >> > > > > v2 = VEC_PERM_EXPR (v1, v1, mask)
> >> > > > > -->
> >> > > > > v2 = v0
> >> > > > > if mask is { nelts - 1, nelts - 2, nelts - 3, ... }
> >> > > > >
> >> > > > > Code-gen with patch:
> >> > > > > f:
> >> > > > > ret
> >> > > > >
> >> > > > > Bootstrap+test passes on aarch64-linux-gnu, and SVE bootstrap in 
> >> > > > > progress.
> >> > > > > Does it look OK for stage-1 ?
> >> > > >
> >> > > > I didn't look at the patch but 
> >> > > > tree-ssa-forwprop.cc:simplify_permutation should
> >> > > > handle two consecutive permutes with the 
> >> > > > is_combined_permutation_identity
> >> > > > which might need tweaking for VLA vectors
> >> > > Hi Richard,
> >> > > Thanks for the suggestions. The attached patch modifies
> >> > > is_combined_permutation_identity
> >> > > to recognize the above pattern.
> >> > > Does it look OK ?
> >> > > Bootstrap+test in progress on aarch64-linux-gnu and x86_64-linux-gnu.
> >> > Hi,
> >> > ping https://gcc.gnu.org/pipermail/gcc-patches/2023-April/615502.html
> >>
> >> Can you instead of def_stmt pass in a bool whether rhs1 is equal to rhs2
> >> and amend the function comment accordingly, say,
> >>
> >>   tem = VEC_PERM ;
> >>   res = VEC_PERM ;
> >>
> >> SAME_P specifies whether op0 and op1 compare equal.  */
> >>
> >> +  if (def_stmt)
> >> +gcc_checking_assert (is_gimple_assign (def_stmt)
> >> +&& gimple_assign_rhs_code (def_stmt) == 
> >> VEC_PERM_EXPR);
> >> this is then unnecessary
> >>
> >>mask = fold_ternary (VEC_PERM_EXPR, TREE_TYPE (mask1), mask1, mask1, 
> >> mask2);
> >> +
> >> +  /* For VLA masks, check for the following pattern:
> >> + v1 = VEC_PERM_EXPR (v0, v0, mask)
> >> + v2 = VEC_PERM_EXPR (v1, v1, mask)
> >> + -->
> >> + v2 = v0
> >>
> >> you are not using 'mask' so please defer fold_ternary until after your
> >> special-case.
> >>
> >> +  if (operand_equal_p (mask1, mask2, 0)
> >> +  && !VECTOR_CST_NELTS (mask1).is_constant ()
> >> +  && def_stmt
> >> +  && operand_equal_p (gimple_assign_rhs1 (def_stmt),
> >> + gimple_assign_rhs2 (def_stmt), 0))
> >> +{
> >> +  vec_perm_builder builder;
> >> +  if (tree_to_vec_perm_builder (&builder, mask1))
> >> +   {
> >> + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask1));
> >> + vec_perm_indices sel (builder, 1, nelts);
> >> + if (sel.series_p (0, 1, nelts - 1, -1))
>

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-22 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 21 Apr 2023 at 20:45, Prathamesh Kulkarni
 wrote:
>
> On Fri, 21 Apr 2023 at 14:47, Richard Sandiford
>  wrote:
> >
> > Prathamesh Kulkarni  writes:
> > > Hi,
> > > I tested the interleave+zip1 for vector init patch and it segfaulted
> > > during bootstrap while trying to build
> > > libgfortran/generated/matmul_i2.c.
> > > Rebuilding with --enable-checking=rtl showed out of bounds access in
> > > aarch64_unzip_vector_init in following hunk:
> > >
> > > +  rtvec vec = rtvec_alloc (n / 2);
> > > +  for (int i = 0; i < n; i++)
> > > +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > > + : XVECEXP (vals, 0, 2 * i + 1);
> > >
> > > which is incorrect since it allocates n/2 but iterates and stores upto n.
> > > The attached patch fixes the issue, which passed bootstrap, however
> > > resulted in following fallout during testsuite run:
> > >
> > > 1] sve/acle/general/dupq_[1-4].c tests fail.
> > > For the following test:
> > > int32x4_t f(int32_t x)
> > > {
> > >   return (int32x4_t) { x, 1, 2, 3 };
> > > }
> > >
> > > Code-gen without patch:
> > > f:
> > > adrpx1, .LC0
> > > ldr q0, [x1, #:lo12:.LC0]
> > > ins v0.s[0], w0
> > > ret
> > >
> > > Code-gen with patch:
> > > f:
> > > moviv0.2s, 0x2
> > > adrpx1, .LC0
> > > ldr d1, [x1, #:lo12:.LC0]
> > > ins v0.s[0], w0
> > > zip1v0.4s, v0.4s, v1.4s
> > > ret
> > >
> > > It shows, fallback_seq_cost = 20, seq_total_cost = 16
> > > where seq_total_cost determines the cost for interleave+zip1 sequence
> > > and fallback_seq_cost is the cost for fallback sequence.
> > > Altho it shows lesser cost, I am not sure if the interleave+zip1
> > > sequence is better in this case ?
> >
> > Debugging the patch, it looks like this is because the fallback sequence
> > contains a redundant pseudo-to-pseudo move, which is costed as 1
> > instruction (4 units).  The RTL equivalent of the:
> >
> >  moviv0.2s, 0x2
> >  ins v0.s[0], w0
> >
> > has a similar redundant move, but the cost of that move is subsumed by
> > the cost of the other arm (the load from LC0), which is costed as 3
> > instructions (12 units).  So we have 12 + 4 for the parallel version
> > (correct) but 12 + 4 + 4 for the serial version (one instruction too
> > many).
> >
> > The reason we have redundant moves is that the expansion code uses
> > copy_to_mode_reg to force a value into a register.  This creates a
> > new pseudo even if the original value was already a register.
> > Using force_reg removes the moves and makes the test pass.
> >
> > So I think the first step is to use force_reg instead of
> > copy_to_mode_reg in aarch64_simd_dup_constant and
> > aarch64_expand_vector_init (as a preparatory patch).
> Thanks for the clarification!
> >
> > > 2] sve/acle/general/dupq_[5-6].c tests fail:
> > > int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
> > > {
> > >   return (int32x4_t) { x0, x1, x2, x3 };
> > > }
> > >
> > > code-gen without patch:
> > > f:
> > > fmovs0, w0
> > > ins v0.s[1], w1
> > > ins v0.s[2], w2
> > > ins v0.s[3], w3
> > > ret
> > >
> > > code-gen with patch:
> > > f:
> > > fmovs0, w0
> > > fmovs1, w1
> > > ins v0.s[1], w2
> > > ins v1.s[1], w3
> > > zip1v0.4s, v0.4s, v1.4s
> > > ret
> > >
> > > It shows fallback_seq_cost = 28, seq_total_cost = 16
> >
> > The zip verson still wins after the fix above, but by a lesser amount.
> > It seems like a borderline case.
> >
> > >
> > > 3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
> > > Test case:
> > > void cons2_8_float(float *x, float val0, float val1)
> > > {
> > > #pragma GCC unroll(8)
> > >   for (int i = 0; i < 8 * 2; i += 2) {
> > > x[i + 0] = val0;
> > > x[i + 1] = val1;
> > >   }
> > > }
> > >
> > > which is lowered to:
> > > void cons2_8_float (float * x, float val0, float val1)
> > > {
> > &g

Re: [aarch64] Use force_reg instead of copy_to_mode_reg

2023-04-21 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 21 Apr 2023 at 21:00, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi Richard,
> > Based on your suggestions in the other thread, the patch uses force_reg
> > to avoid creating pseudo if value is already in a register.
> > Bootstrap+test passes on aarch64-linux-gnu.
> > OK to commit ?
> >
> > Thanks,
> > Prathamesh
> >
> > [aarch64] Use force_reg instead of copy_to_mode_reg.
> >
> > Use force_reg instead of copy_to_mode_reg in aarch64_simd_dup_constant
> > and aarch64_expand_vector_init to avoid creating pseudo if original value
> > is already in a register.
> >
> > gcc/ChangeLog:
> >   * config/aarch64/aarch64.cc (aarch64_simd_dup_constant): Use
> >   force_reg instead of copy_to_mode_reg.
> >   (aarch64_expand_vector_init): Likewise.
>
> OK, thanks.
Thanks, committed in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=e306501ff556647dc31915a63ce95a5496f08f97

Thanks,
Prathamesh
>
> Richard
>
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 0d7470c05a1..321580d7f6a 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -21968,7 +21968,7 @@ aarch64_simd_dup_constant (rtx vals)
> >/* We can load this constant by using DUP and a constant in a
> >   single ARM register.  This will be cheaper than a vector
> >   load.  */
> > -  x = copy_to_mode_reg (inner_mode, x);
> > +  x = force_reg (inner_mode, x);
> >return gen_vec_duplicate (mode, x);
> >  }
> >
> > @@ -22082,7 +22082,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >/* Splat a single non-constant element if we can.  */
> >if (all_same)
> >  {
> > -  rtx x = copy_to_mode_reg (inner_mode, v0);
> > +  rtx x = force_reg (inner_mode, v0);
> >aarch64_emit_move (target, gen_vec_duplicate (mode, x));
> >return;
> >  }
> > @@ -22190,12 +22190,12 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >vector register.  For big-endian we want that position to hold
> >the last element of VALS.  */
> > maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
> > -   rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 
> > maxelement));
> > +   rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> > aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
> >   }
> >else
> >   {
> > -   rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 
> > maxelement));
> > +   rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> > aarch64_emit_move (target, gen_vec_duplicate (mode, x));
> >   }
> >
> > @@ -22205,7 +22205,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> > rtx x = XVECEXP (vals, 0, i);
> > if (matches[i][0] == maxelement)
> >   continue;
> > -   x = copy_to_mode_reg (inner_mode, x);
> > +   x = force_reg (inner_mode, x);
> > emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
> >   }
> >return;
> > @@ -22249,7 +22249,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >rtx x = XVECEXP (vals, 0, i);
> >if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
> >   continue;
> > -  x = copy_to_mode_reg (inner_mode, x);
> > +  x = force_reg (inner_mode, x);
> >emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
> >  }
> >  }


  1   2   3   4   5   6   7   8   9   10   >