Re: [PATCH] c++: Fix folding of non-dependent BASELINKs [PR95468]

2021-02-13 Thread Patrick Palka via Gcc-patches
On Fri, 12 Feb 2021, Patrick Palka wrote:

> Here, the problem ultimately seems to be that tsubst_copy_and_build,
> when called with empty args as we do during non-dependent expression
> folding, doesn't touch BASELINKs at all: it delegates to tsubst_copy
> which then immediately exits early due to the empty args.  This means
> that the CAST_EXPR int(1) in the BASELINK A::condition never
> gets folded (as part of folding of the overall CALL_EXPR), which later
> causes us to crash when performing overload resolution of the rebuilt
> CALL_EXPR (which is in terms of this still-templated BASELINK).
> 
> This doesn't happen when condition() is a namespace-scope function
> because then condition is represented as a TEMPLATE_ID_EXPR
> rather than a BASELINK, which does get handled directly from
> tsubst_copy_and_build.
> 
> This patch fixes this issue by having tsubst_copy_and_build handle
> BASELINK directly rather than delegating to tsubst_copy, so that it
> processes BASELINKS even when args is empty.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> trunk?
> 
> gcc/cp/ChangeLog:
> 
>   PR c++/95468
>   * pt.c (tsubst_copy_and_build) <case BASELINK>: New case, copied
>   over from tsubst_copy.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR c++/95468
>   * g++.dg/template/non-dependent15.C: New test.
> ---
>  gcc/cp/pt.c |  5 +++++
>  gcc/testsuite/g++.dg/template/non-dependent15.C | 12 ++++++++++++
>  2 files changed, 17 insertions(+)
>  create mode 100644 gcc/testsuite/g++.dg/template/non-dependent15.C
> 
> diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
> index 5102bf02d0f..5b2f43dc5c1 100644
> --- a/gcc/cp/pt.c
> +++ b/gcc/cp/pt.c
> @@ -19856,6 +19856,11 @@ tsubst_copy_and_build (tree t,
>  case SCOPE_REF:
>RETURN (tsubst_qualified_id (t, args, complain, in_decl, /*done=*/true,
> /*address_p=*/false));
> +
> +case BASELINK:
> +  return tsubst_baselink (t, current_nonlambda_class_type (),
> +   args, complain, in_decl);

Oops, this should use the RETURN macro instead of a bare 'return' so
that input_location gets properly restored on function exit.

And it looks like there's an existing instance of this bug in the
LAMBDA_EXPR case of tsubst_copy_and_build.  Perhaps it's a good time to
replace these RETURN macros with the equivalent use of iloc_sentinel,
like so?  Bootstrapped and tested on x86_64-pc-linux-gnu.
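
For context, the original issue can be reproduced with a sketch along
these lines (a hedged illustration; see the new test
g++.dg/template/non-dependent15.C in the first patch for the committed
version):

    struct A {
      template <int> static bool condition () { return true; }
    };

    template <class T>
    bool f ()
    {
      // Non-dependent call through a BASELINK whose template argument
      // is the CAST_EXPR int(1) discussed above.
      return A::condition<int(1)> ();
    }

    bool b = f<int> ();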

-- >8 --

Subject: [PATCH] c++: Replace RETURN macros with iloc_sentinel

This replaces the C-era RETURN macro idiom used by some of the tsubsting
functions with an iloc_sentinel declared at the start of each function.
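
For readers unfamiliar with the idiom, here is a minimal, self-contained
sketch of what such an RAII location sentinel does (illustrative names
only; plain ints stand in for GCC's location_t and input_location):

    #include <iostream>

    // Stand-in for GCC's global input_location.
    static int input_location = 0;

    // RAII sentinel: override a global on entry, restore it on every
    // exit path -- exactly what the RETURN macro's "goto out" emulated
    // by hand.
    struct loc_sentinel {
      int saved;
      explicit loc_sentinel (int loc) : saved (input_location)
      { input_location = loc; }
      ~loc_sentinel () { input_location = saved; }
    };

    static int sub (int t)
    {
      loc_sentinel ils (t);
      if (t < 0)
        return -1;      // a plain return now restores the location
      return t * 2;
    }

    int main ()
    {
      sub (42);
      std::cout << input_location << "\n";  // prints 0: restored on exit
    }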

gcc/cp/ChangeLog:

* pt.c (tsubst_decl): Delete the RETURN macro, and replace its
uses with a plain 'return'.  Set up an iloc_sentinel at the
start of the function to set and restore input_location.
(tsubst_expr): Likewise.  Remove redundant break statements that
immediately follow a return.
(tsubst_copy_and_build): Likewise.  Remove 'retval' local
variable.  Add gcc_unreachable to the end of the function.
---
 gcc/cp/pt.c | 316 
 1 file changed, 145 insertions(+), 171 deletions(-)

diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index 5b2f43dc5c1..32d1759258c 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -14414,15 +14414,12 @@ enclosing_instantiation_of (tree otctx)
 static tree
 tsubst_decl (tree t, tree args, tsubst_flags_t complain)
 {
-#define RETURN(EXP) do { r = (EXP); goto out; } while(0)
-  location_t saved_loc;
   tree r = NULL_TREE;
   tree in_decl = t;
   hashval_t hash = 0;
 
-  /* Set the filename and linenumber to improve error-reporting.  */
-  saved_loc = input_location;
-  input_location = DECL_SOURCE_LOCATION (t);
+  /* Set the source position to improve error-reporting.  */
+  iloc_sentinel ils (DECL_SOURCE_LOCATION (t));
 
   switch (TREE_CODE (t))
 {
@@ -14453,7 +14450,7 @@ tsubst_decl (tree t, tree args, tsubst_flags_t complain)
 if (spec 
 && TREE_CODE (spec) == PARM_DECL
 && TREE_CODE (TREE_TYPE (spec)) != TYPE_PACK_EXPANSION)
-  RETURN (spec);
+ return spec;
 
 /* Expand the TYPE_PACK_EXPANSION that provides the types for
the parameters in this function parameter pack.  */
@@ -14466,8 +14463,8 @@ tsubst_decl (tree t, tree args, tsubst_flags_t complain)
 /* Zero-length parameter packs are boring. Just substitute
into the chain.  */
if (len == 0 && !cp_unevaluated_operand)
-  RETURN (tsubst (TREE_CHAIN (t), args, complain,
- TREE_CHAIN (t)));
+ return tsubst (TREE_CHAIN (t), args, complain,
+TREE_CHAIN (t));
   }
 else
   {
@@ -14587,7 +14584,7 @@ 

[PATCH] libiberty: autogenerate aclocal.m4

2021-02-13 Thread Mike Frysinger via Gcc-patches
Move custom macros to acinclude.m4 so we can autogenerate aclocal.m4
with aclocal.  This matches every other project in the tree.

libiberty/ChangeLog:

* Makefile.in (ACLOCAL, ACLOCAL_AMFLAGS, $(srcdir)/aclocal.m4): Define.
(configure_deps): Rename to ...
(aclocal_deps): ... this.  Replace aclocal.m4 with acinclude.m4.
($(srcdir)/configure): Replace $(configure_deps) with
$(srcdir)/aclocal.m4.
* aclocal.m4: Move libiberty macros to acinclude.m4, then regenerate.
* acinclude.m4: New file.
* configure: Regenerate.
---
 libiberty/Makefile.in  |  12 ++-
 libiberty/acinclude.m4 | 185 ++
 libiberty/aclocal.m4   | 198 +
 libiberty/configure|   3 -
 4 files changed, 215 insertions(+), 183 deletions(-)
 create mode 100644 libiberty/acinclude.m4

diff --git a/libiberty/Makefile.in b/libiberty/Makefile.in
index 788590957e17..4f1213b983b6 100644
--- a/libiberty/Makefile.in
+++ b/libiberty/Makefile.in
@@ -481,18 +481,24 @@ config.status: $(srcdir)/configure
$(SHELL) ./config.status --recheck
 
 AUTOCONF = autoconf
-configure_deps = $(srcdir)/aclocal.m4 \
+ACLOCAL = aclocal
+ACLOCAL_AMFLAGS = -I ../config -I ..
+aclocal_deps = \
$(srcdir)/../config/acx.m4 \
$(srcdir)/../config/cet.m4 \
$(srcdir)/../config/enable.m4 \
$(srcdir)/../config/no-executables.m4 \
$(srcdir)/../config/override.m4 \
$(srcdir)/../config/picflag.m4 \
-   $(srcdir)/../config/warnings.m4
+   $(srcdir)/../config/warnings.m4 \
+   $(srcdir)/acinclude.m4
 
-$(srcdir)/configure: @MAINT@ $(srcdir)/configure.ac $(configure_deps)
+$(srcdir)/configure: @MAINT@ $(srcdir)/configure.ac $(srcdir)/aclocal.m4
cd $(srcdir) && $(AUTOCONF)
 
+$(srcdir)/aclocal.m4: @MAINT@ $(aclocal_deps)
+   cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
+
 # Depending on config.h makes sure that config.status has been re-run
 # if needed.  This prevents problems with parallel builds, in case
 # subdirectories need to run config.status also.
diff --git a/libiberty/acinclude.m4 b/libiberty/acinclude.m4
new file mode 100644
index ..6db0e5085171
--- /dev/null
+++ b/libiberty/acinclude.m4
@@ -0,0 +1,185 @@
+dnl Copyright (C) 2000-2021 Free Software Foundation, Inc.
+dnl
+dnl GCC is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3, or (at your option)
+dnl any later version.
+dnl
+dnl GCC is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU General Public License
+dnl along with GCC; see the file COPYING3.  If not see
+dnl <http://www.gnu.org/licenses/>.
+
+dnl See whether strncmp reads past the end of its string parameters.
+dnl On some versions of SunOS4 at least, strncmp reads a word at a time
+dnl but erroneously reads past the end of strings.  This can cause
+dnl a SEGV in some cases.
+AC_DEFUN([libiberty_AC_FUNC_STRNCMP],
+[AC_REQUIRE([AC_FUNC_MMAP])
+AC_CACHE_CHECK([for working strncmp], ac_cv_func_strncmp_works,
+[AC_TRY_RUN([
+/* Test by Jim Wilson and Kaveh Ghazi.
+   Check whether strncmp reads past the end of its string parameters. */
+#include <sys/types.h>
+
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifndef MAP_ANON
+#ifdef MAP_ANONYMOUS
+#define MAP_ANON MAP_ANONYMOUS
+#else
+#define MAP_ANON MAP_FILE
+#endif
+#endif
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
+#ifndef O_RDONLY
+#define O_RDONLY 0
+#endif
+
+#define MAP_LEN 0x10000
+
+main ()
+{
+#if defined(HAVE_MMAP) || defined(HAVE_MMAP_ANYWHERE)
+  char *p;
+  int dev_zero;
+
+  dev_zero = open ("/dev/zero", O_RDONLY);
+  if (dev_zero < 0)
+exit (1);
+
+  p = (char *) mmap (0, MAP_LEN, PROT_READ|PROT_WRITE,
+MAP_ANON|MAP_PRIVATE, dev_zero, 0);
+  if (p == (char *)-1)
+p = (char *) mmap (0, MAP_LEN, PROT_READ|PROT_WRITE,
+  MAP_ANON|MAP_PRIVATE, -1, 0);
+  if (p == (char *)-1)
+exit (2);
+  else
+{
+  char *string = "__si_type_info";
+  char *q = (char *) p + MAP_LEN - strlen (string) - 2;
+  char *r = (char *) p + 0xe000;
+
+  strcpy (q, string);
+  strcpy (r, string);
+  strncmp (r, q, 14);
+}
+#endif /* HAVE_MMAP || HAVE_MMAP_ANYWHERE */
+  exit (0);
+}
+], ac_cv_func_strncmp_works=yes, ac_cv_func_strncmp_works=no,
+  ac_cv_func_strncmp_works=yes)
+rm -f core core.* *.core])
+if test $ac_cv_func_strncmp_works = no ; then
+  AC_LIBOBJ([strncmp])
+fi
+])
+
+dnl See if errno must be declared even when <errno.h> is included.
+AC_DEFUN([libiberty_AC_DECLARE_ERRNO],
+[AC_CACHE_CHECK(whether errno must be declared, 

[PATCH] handle bad __dynamic_cast more gracefully (PR 99074)

2021-02-13 Thread Martin Sebor via Gcc-patches

The test case in PR 99074 invokes dynamic_cast with the this pointer
in a non-static member function called on a null pointer.  The call
is, of course, undefined, and under other circumstances it would be
diagnosed by -Wnonnull.  Unfortunately, in the test case, the null
pointer is the result of inlining and constant propagation and so
is detected neither by the front end -Wnonnull nor by the middle end.
The program ends up passing it to __dynamic_cast() which then
crashes at runtime (again, not surprising for undefined behavior).

However, the reporter says the code behaved gracefully (didn't crash)
when compiled with GCC 4.8, and in my tests it also doesn't crash
when compiled with Clang or ICC.  I looked to see if it's possible
to do better and it seems it is.

The attached patch improves things by changing __dynamic_cast to
fail by returning null when the first argument is null, and also
by declaring __dynamic_cast with attribute nonnull so that invalid
calls to it with a constant null pointer can be detected at compile
time.
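
As a minimal illustration of the attribute half of this (a hedged
sketch with made-up names, not the patch itself):

    /* Passing a constant null pointer to a parameter of a function
       declared nonnull is diagnosed at compile time by -Wnonnull.  */
    __attribute__ ((nonnull))
    static int depends_on_ptr (const void *p)
    {
      return p != (const void *) 0;
    }

    int
    main (void)
    {
      return depends_on_ptr ((const void *) 0);  /* -Wnonnull warning */
    }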

Since the test case is undefined it seems borderline whether this
can strictly be considered a regression, even if some previous
releases handled it more gracefully.  I post it for consideration
either now or for GCC 12.

Tested on x86_64-linux.

Martin
PR c++/99074 - crash in dynamic_cast<>() on null pointer

gcc/cp/ChangeLog:

	PR c++/99074
	* rtti.c (build_dynamic_cast_1): Declare nonnull.

libstdc++-v3/ChangeLog:

	PR c++/99074
	* libsupc++/dyncast.cc (__dynamic_cast): Return null when
	first argument is null.

gcc/testsuite/ChangeLog:

	PR c++/99074
	* g++.dg/warn/Wnonnull11.C: New test.

diff --git a/gcc/cp/rtti.c b/gcc/cp/rtti.c
index b41d95469c6..b78b5f2dab4 100644
--- a/gcc/cp/rtti.c
+++ b/gcc/cp/rtti.c
@@ -30,6 +30,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "stor-layout.h"
 #include "c-family/c-pragma.h"
 #include "gcc-rich-location.h"
+#include "attribs.h"
 
 /* C++ returns type information to the user in struct type_info
objects. We also use type information to implement dynamic_cast and
@@ -767,6 +768,11 @@ build_dynamic_cast_1 (location_t loc, tree type, tree expr,
 	  dcast_fn = (build_library_fn_ptr
 			  (fn_name, fn_type, ECF_LEAF | ECF_PURE | ECF_NOTHROW));
 	  pop_abi_namespace (flags);
+
+	  /* __dynamic_cast expects all nonnull pointers.  */
+	  tree attr = tree_cons (get_identifier ("nonnull"),
+ NULL_TREE, NULL_TREE);
+	  decl_attributes (&dcast_fn, attr, 0);
 	  dynamic_cast_node = dcast_fn;
 	}
 	  result = build_cxx_call (dcast_fn, 4, elems, complain);
diff --git a/gcc/testsuite/g++.dg/warn/Wnonnull11.C b/gcc/testsuite/g++.dg/warn/Wnonnull11.C
new file mode 100644
index 000..1dcd820356a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/warn/Wnonnull11.C
@@ -0,0 +1,40 @@
+/* PR c++/99074 - gcc 8 and above is crashing with dynamic_cast<>() on null
+   pointer with optimization level -O1 and above
+   { dg-do run }
+   { dg-options "-O1 -Wall" } */
+
+class Base
+{
+public:
+  virtual ~Base() {}
+  virtual void op() = 0;
+};
+
+class Object: public virtual Base { };
+
+class AbstractBase: public virtual Base
+{
+public:
+  Object* _to_object ()
+  {
+    return dynamic_cast<Object*>(this);   // { dg-warning "\\\[-Wnonnull" }
+  }
+};
+
+class MyAbstractClass: public virtual AbstractBase
+{
+public:
+  static MyAbstractClass* _nil () { return 0; }
+};
+
+
+int main ()
+{
+  MyAbstractClass *my_abs_type = MyAbstractClass::_nil ();
+  AbstractBase *abs_base = my_abs_type;
+  Object *obj = abs_base->_to_object ();
+
+  __builtin_printf ("object is: %p\n", obj);
+
+  return 0;
+}
diff --git a/libstdc++-v3/libsupc++/dyncast.cc b/libstdc++-v3/libsupc++/dyncast.cc
index b7d98495ad3..f8f707ee4d4 100644
--- a/libstdc++-v3/libsupc++/dyncast.cc
+++ b/libstdc++-v3/libsupc++/dyncast.cc
@@ -47,6 +47,9 @@ __dynamic_cast (const void *src_ptr,// object started from
 const __class_type_info *dst_type, // desired target type
 ptrdiff_t src2dst) // how src and dst are related
   {
+  if (!src_ptr)
+    /* Handle precondition violations gracefully.  */
+    return NULL;
   const void *vtable = *static_cast <const void *const *> (src_ptr);
   const vtable_prefix *prefix =
 (adjust_pointer 


[Patch] Fortran: Fix ubound simplification [PR99027]

2021-02-13 Thread Tobias Burnus

The problem with accessing dim= for an expression is that,
when the argument is an array section, dim= and expr->rank are relative
to the resulting array – but the array-ref also contains
DIMEN_ELEMENT dimensions, which do not count for dim=/rank.
For example, for array2(3,:,4:4,:), dim=1 of the result corresponds
to dimension 2 of array2, since the leading DIMEN_ELEMENT is skipped.

OK for the trunk? (Reported against GCC 11, but I wonder whether
we should still apply it to GCC 10.)

Tobias

Fortran: Fix ubound simplification [PR99027]

gcc/fortran/ChangeLog:

	PR fortran/99027
	* simplify.c (simplify_bound_dim): Honor DIMEN_ELEMENT
	when using dim=.

gcc/testsuite/ChangeLog:

	PR fortran/99027
	* gfortran.dg/ubound_1.f90: New test.

 gcc/fortran/simplify.c | 12 +++-
 gcc/testsuite/gfortran.dg/ubound_1.f90 | 22 ++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/gcc/fortran/simplify.c b/gcc/fortran/simplify.c
index 23317a2e2d9..35f267db588 100644
--- a/gcc/fortran/simplify.c
+++ b/gcc/fortran/simplify.c
@@ -4168,7 +4168,17 @@ simplify_bound_dim (gfc_expr *array, gfc_expr *kind, int d, int upper,
 {
   if (upper)
 	{
-	  if (!gfc_ref_dimen_size (&ref->u.ar, d - 1, &result->value.integer, NULL))
+	  int d2 = 0, cnt = 0;
+	  for (int idx = 0; idx < ref->u.ar.dimen; ++idx)
+	{
+	  if (ref->u.ar.dimen_type[idx] == DIMEN_ELEMENT)
+		d2++;
+	  else if (cnt < d - 1)
+		cnt++;
+	  else
+		break;
+	}
+	  if (!gfc_ref_dimen_size (&ref->u.ar, d2 + d - 1, &result->value.integer, NULL))
 	goto returnNull;
 	}
   else
diff --git a/gcc/testsuite/gfortran.dg/ubound_1.f90 b/gcc/testsuite/gfortran.dg/ubound_1.f90
new file mode 100644
index 000..7c588b026e1
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/ubound_1.f90
@@ -0,0 +1,22 @@
+! { dg-do run }
+!
+! PR fortran/99027
+!
+  program test
+integer, dimension (1:3,1:6) :: array
+integer, dimension (2:5,3:7,4:9,-4:2) :: array2
+
+if (any ([4] /= ubound (array (1, 1:4)))) stop 1
+if (4 /= ubound (array (1, 1:4), dim=1)) stop 2
+
+if (any (ubound (array2 (3,3,4,:))/= [4+1+2])) stop 3
+if ( ubound (array2 (3,3,4,:), dim=1) /=  4+1+2 ) stop 4
+
+if (any (ubound (array2 (3,:,4,:))/= [7-3+1, 4+1+2])) stop 5
+if ( ubound (array2 (3,:,4,:), dim=1) /=  7-3+1   ) stop 6
+if ( ubound (array2 (3,:,4,:), dim=2) /= 4+1+2) stop 7
+if (any (ubound (array2 (3,:,4:4,:))/= [7-3+1, 1, 4+1+2])) stop 8
+if ( ubound (array2 (3,:,4:4,:), dim=1) /=  7-3+1  ) stop 9
+if ( ubound (array2 (3,:,4:4,:), dim=2) /= 1   ) stop 10
+if ( ubound (array2 (3,:,4:4,:), dim=3) /=4+1+2) stop 11
+  end program test


[PATCH] RISC-V: Avoid zero/sign extend for volatile loads. Fix for 97417.

2021-02-13 Thread Jim Wilson
From: Levy Hsu 

This expands sub-word loads as a zero/sign extended load, followed by
a subreg.  This helps eliminate unnecessary zero/sign extend insns after
the load, particularly for volatiles, but also in some other cases.
Testing shows that it gives consistent code size decreases.
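
As an illustration (a sketch, not the PR's testcase), the kind of code
this affects is a sub-word volatile load, which previously could be
followed by a redundant zero- or sign-extension instruction:

    /* With the patch, the QImode load is expanded as a zero-extended
       word-mode load plus a subreg, so no separate extension remains.  */
    unsigned char
    read_byte (volatile unsigned char *dev)
    {
      return *dev;
    }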

Tested with riscv32-elf rv32imac/ilp32 and riscv64-linux rv64gc/lp64d
builds and checks.  Some -gsplit-stack tests fail with the patch, but
this turns out to be an existing bug with the split-stack support that
I hadn't noticed before.  It isn't a bug in this patch.  Ignoring that,
there are no regressions.

Committed.

gcc/
PR target/97417
* config/riscv/riscv-shorten-memrefs.c (pass_shorten_memrefs): Add
extend parameter to get_si_mem_base_reg declaration.
(get_si_mem_base_reg): Add extend parameter.  Set it.
(analyze): Pass extend arg to get_si_mem_base_reg.
(transform): Likewise.  Use it when rewriting mems.
* config/riscv/riscv.c (riscv_legitimize_move): Check for subword
loads and emit sign/zero extending load followed by subreg move.
---
 gcc/config/riscv/riscv-shorten-memrefs.c | 34 +++-
 gcc/config/riscv/riscv.c | 22 +++
 2 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/gcc/config/riscv/riscv-shorten-memrefs.c 
b/gcc/config/riscv/riscv-shorten-memrefs.c
index b1b57f1b5e0..3f34065c1ce 100644
--- a/gcc/config/riscv/riscv-shorten-memrefs.c
+++ b/gcc/config/riscv/riscv-shorten-memrefs.c
@@ -75,12 +75,19 @@ private:
 
   regno_map * analyze (basic_block bb);
   void transform (regno_map *m, basic_block bb);
-  bool get_si_mem_base_reg (rtx mem, rtx *addr);
+  bool get_si_mem_base_reg (rtx mem, rtx *addr, bool *extend);
 }; // class pass_shorten_memrefs
 
 bool
-pass_shorten_memrefs::get_si_mem_base_reg (rtx mem, rtx *addr)
+pass_shorten_memrefs::get_si_mem_base_reg (rtx mem, rtx *addr, bool *extend)
 {
+  /* Whether it's sign/zero extended.  */
+  if (GET_CODE (mem) == ZERO_EXTEND || GET_CODE (mem) == SIGN_EXTEND)
+{
+  *extend = true;
+  mem = XEXP (mem, 0);
+}
+
   if (!MEM_P (mem) || GET_MODE (mem) != SImode)
 return false;
   *addr = XEXP (mem, 0);
@@ -110,7 +117,8 @@ pass_shorten_memrefs::analyze (basic_block bb)
{
  rtx mem = XEXP (pat, i);
  rtx addr;
- if (get_si_mem_base_reg (mem, &addr))
+ bool extend = false;
+ if (get_si_mem_base_reg (mem, &addr, &extend))
{
  HOST_WIDE_INT regno = REGNO (XEXP (addr, 0));
  /* Do not count store zero as these cannot be compressed.  */
@@ -150,7 +158,8 @@ pass_shorten_memrefs::transform (regno_map *m, basic_block 
bb)
{
  rtx mem = XEXP (pat, i);
  rtx addr;
- if (get_si_mem_base_reg (mem, &addr))
+ bool extend = false;
+ if (get_si_mem_base_reg (mem, &addr, &extend))
{
  HOST_WIDE_INT regno = REGNO (XEXP (addr, 0));
  /* Do not transform store zero as these cannot be compressed.  */
@@ -161,9 +170,20 @@ pass_shorten_memrefs::transform (regno_map *m, basic_block 
bb)
}
  if (m->get_or_insert (regno) > 3)
{
- addr
-   = targetm.legitimize_address (addr, addr, GET_MODE (mem));
- XEXP (pat, i) = replace_equiv_address (mem, addr);
+ if (extend)
+   {
+ addr
+   = targetm.legitimize_address (addr, addr,
+ GET_MODE (XEXP (mem, 0)));
+ XEXP (XEXP (pat, i), 0)
+   = replace_equiv_address (XEXP (mem, 0), addr);
+   }
+ else
+   {
+ addr = targetm.legitimize_address (addr, addr,
+GET_MODE (mem));
+ XEXP (pat, i) = replace_equiv_address (mem, addr);
+   }
  df_insn_rescan (insn);
}
}
diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
index 7d274596ba3..fffd0814eee 100644
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -1524,6 +1524,28 @@ riscv_legitimize_const_move (machine_mode mode, rtx 
dest, rtx src)
 bool
 riscv_legitimize_move (machine_mode mode, rtx dest, rtx src)
 {
+  /* Expand 
+   (set (reg:QI target) (mem:QI (address))) 
+ to
+   (set (reg:DI temp) (zero_extend:DI (mem:QI (address))))
+   (set (reg:QI target) (subreg:QI (reg:DI temp) 0))
+ with auto-sign/zero extend.  */
+  if (GET_MODE_CLASS (mode) == MODE_INT
+  && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+  && can_create_pseudo_p ()
+  && MEM_P (src))
+{
+  rtx temp_reg;
+  int zero_extend_p;
+
+  temp_reg = gen_reg_rtx (word_mode);
+  zero_extend_p = (LOAD_EXTEND_OP (mode) == ZERO_EXTEND);
+  emit_insn 

[PATCH] RISC-V: Shorten memrefs improvement, partial fix 97417.

2021-02-13 Thread Jim Wilson
We already have a check for riscv_shorten_memrefs in riscv_address_cost.
This adds the same check to riscv_rtx_costs.  Making this work also
requires a change to riscv_compressed_lw_address_p to work before reload
by checking the offset and assuming any pseudo reg is OK.  Testing shows
that this consistently gives small code size reductions.

Tested with riscv32-elf rv32imac/ilp32 and riscv64-linux rv64gc/lp64d
builds and checks and there were no regressions.

Committed.

gcc/
PR target/97417
* config/riscv/riscv.c (riscv_compressed_lw_address_p): Drop early
exit when !reload_completed.  Only perform check for compressed reg
if reload_completed.
(riscv_rtx_costs): In MEM case, when optimizing for size and
shorten memrefs, if not compressible, then increase cost.
---
 gcc/config/riscv/riscv.c | 19 +++
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
index ff41795a031..7d274596ba3 100644
--- a/gcc/config/riscv/riscv.c
+++ b/gcc/config/riscv/riscv.c
@@ -891,17 +891,13 @@ riscv_compressed_lw_address_p (rtx x)
   bool result = riscv_classify_address (&addr, x, GET_MODE (x),
reload_completed);
 
-  /* Before reload, assuming all load/stores of valid addresses get compressed
- gives better code size than checking if the address is reg + small_offset
- early on.  */
-  if (result && !reload_completed)
-return true;
-
   /* Return false if address is not compressed_reg + small_offset.  */
   if (!result
   || addr.type != ADDRESS_REG
-  || (!riscv_compressed_reg_p (REGNO (addr.reg))
-   && addr.reg != stack_pointer_rtx)
+  /* Before reload, assume all registers are OK.  */
+  || (reload_completed
+ && !riscv_compressed_reg_p (REGNO (addr.reg))
+ && addr.reg != stack_pointer_rtx)
   || !riscv_compressed_lw_offset_p (addr.offset))
 return false;
 
@@ -1708,6 +1704,13 @@ riscv_rtx_costs (rtx x, machine_mode mode, int 
outer_code, int opno ATTRIBUTE_UN
 instructions it needs.  */
   if ((cost = riscv_address_insns (XEXP (x, 0), mode, true)) > 0)
{
+ /* When optimizing for size, make uncompressible 32-bit addresses
+more expensive so that compressible 32-bit addresses are
+preferred.  */
+ if (TARGET_RVC && !speed && riscv_mshorten_memrefs && mode == SImode
+ && !riscv_compressed_lw_address_p (XEXP (x, 0)))
+   cost++;
+
  *total = COSTS_N_INSNS (cost + tune_param->memory_cost);
  return true;
}
-- 
2.17.1



Re: [Patch] Fortran: Fix coarray handling for gfc_dep_resolver [PR99010] (was: Re: [Patch] Fortran: Fix Array dependency with local coarrays [PR98913])

2021-02-13 Thread Tobias Burnus

*PING*

On 09.02.21 12:52, Tobias Burnus wrote:

Hi all, hi Thomas,

On 02.02.21 18:15, Tobias Burnus wrote:

I think I will do a combination: If 'identical' is true, I think I
cannot remove it. If it is false, it can be identical or nonoverlapping
– which makes sense.


Well, it turned out that coarray scalars did not work; for them,
the array ref consists only of the coindexed access: falling through
then left fin_dep == GFC_DEP_ERROR.

To be more careful, I also removed the 'identical &&' check such that
the first loop is now always triggered if coarray != SINGLE, not only
if identical – I am not sure whether it is really needed or whether
falling through is the better solution (with updating the comment).

I would be happy if someone could carefully check the logic –
in the details, it is rather confusing.

OK?

Tobias



Re: Patch for PR analyzer/98797

2021-02-13 Thread brian.sobulefsky via Gcc-patches
Hi, answers below. Note, do you want my resubmitted patch to be with
commit --amend, and therefore relative to "real" commits in the history,
or do you want me to chain it onto the last submission?

I have already sent mail to assign at gnu for the form.

I will update the commit message to call it s3 now.

The case did not arise "in practice." After solving the xfail from the
test suite, I tried dividing the array to have more than one field and
found that this still did not work, so I tracked down why. I would
argue that the case of a struct made of only a single array as its one
field is a lot less likely to arise in practice. For what it's worth,
the single field example basically works "for the wrong reason."
When get_binding_recursive goes up the parent chain, it finds a binding
in the parent cast region because that cast is a struct made of only one
field, and so its binding_key is the same as that field. The "right"
logic would check for a field covering the desired region.

I'll fix the ChangeLog formatting parentheses. I found the formatting
script, but I did not have one of the imported python files so I just copied 
others.

I can add the helper routine you have requested. Where would you like it?
I am thinking analyzer.h, so it is visible basically everywhere.

I can add the check for size matching, using whichever helper routine is easiest.

Yes, in the event of *p == 'A', *p, having a constant zerop offset
would get sent to get_cast_region, where it would become a cast region.
This defeats the new logic in maybe_fold_sub_svalue. I think in the case
where we are pointing to an array of the requested type, it is much more
correct to return the offset region with a zero offset, as arr[0] should not
be different than arr[1]. Sometimes the 0 tree would be of pointer_type,
and I was not sure if this would defeat it or not, so I made sure it was
of integer_type. This may just be a matter of my being new and not
knowing for sure how everything works and so erring on the side of safety.

As of now, the routine immediately rejects any case other than where
reg is a field_region and parent is a cast_region. I will think if there
is a C like syntax for the function, really it is checking if the original
form of parent had a field covering the requested field.

I will remove the first guard, leave the second, and try to reformat
the latter into a similar rejection style.

Currently, get_field_at_bit_offset is not an externally visible function.
I am taking your instruction to reuse it to mean a prototype should
be added to the relevant header (it would be region.h I think, as long as
both region-model-manager.cc and store.cc include those).

As you know, I am very new to gcc and so was happy when I could hack my
way to this working at all. I already assumed my direct access was not
correct. Most of what I did is based on "RTFS" since the manual does not
really cover the right way to do these things. I will try the routine
or macro you suggest.

I will change the test file to leave pre-existing lines in place.



‐‐‐ Original Message ‐‐‐
On Friday, February 12, 2021 4:16 PM, David Malcolm  wrote:

> On Wed, 2021-02-10 at 19:42 +0000, brian.sobulefsky wrote:
>
> Hi Brian
>
> Thanks for the patch.
>
> The patch is large enough to count as legally significant; I've sent
> you information on copyright assignment separately; the patch can't be
> committed until that paperwork is in place.
>
> In the meantime, I've added some review notes inline below throughout:
>
> > Address the bug found in test file 
> > gcc/testsuite/gcc.dg/analyzer/casts-1.c, as
> > recorded by the XFAIL, and some simpler and more complex versions found 
> > during
> > the investigation of it. PR analyzer/98797 was opened for these bugs.
> >
> > The simplest manifestation of this bug can be replicated with:
> >
> > char arr[] = {'A', 'B', 'C', 'D'};
> > char *pt = arr;
> > __analyzer_eval(*(pt + 0) == 'A');
> > __analyzer_eval(*(pt + 1) == 'B');
> > //etc
> >
> > The result of the eval call is "UNKNOWN" because the analyzer is unable 
> > to
> > determine the value at the pointer. Note that array access (such as 
> > arr[0]) is
> > correctly handled. The code responsible for this is in file
> > region-model-manager.cc, function 
> > region_model_manager::maybe_fold_sub_svalue.
> > The relevant section is commented /* Handle getting individual chars 
> > from
> > STRING_CST */. This section only had a case for an element_region. A 
> > case
> > needed to be added for an offset_region.
> >
> > Additionally, when the offset was 0, such as in *pt or *(pt + 0), the 
> > function
> > region_model_manager::get_offset_region was failing to make the needed 
> > offset
> > region at all. This was due to the test for a constant 0 pointer that 
> > was then
> > returning get_cast_region. A special case is needed for when PARENT is 
> > of 

Re: [PATCH] passes: Enable split4 with selective scheduling 2 [PR98439]

2021-02-13 Thread Richard Biener
On February 13, 2021 4:07:03 PM GMT+01:00, Jakub Jelinek  
wrote:
>On Sat, Feb 13, 2021 at 02:54:38PM +0100, Richard Biener wrote:
>> Ok.  But if required splitting is an IL property maybe we can see sth like
>> RTL_split_insns, clear it from passes like selsched and gate split on the
>> property and not being present?
>
>I guess better would be to track it after reload during recog (if we recog
>an instruction that requires splitting set a flag/property that splitting
>will be needed).  If it is worth the effort that is.

Yeah, not sure. 

Richard. 

>   Jakub



Re: [PATCH] passes: Enable split4 with selective scheduling 2 [PR98439]

2021-02-13 Thread Jakub Jelinek via Gcc-patches
On Sat, Feb 13, 2021 at 02:54:38PM +0100, Richard Biener wrote:
> Ok.  But if required splitting is an IL property maybe we can see sth like
> RTL_split_insns, clear it from passes like selsched and gate split on the
> property and not being present?

I guess better would be to track it after reload during recog (if we recog
an instruction that requires splitting set a flag/property that splitting will 
be
needed).  If it is worth the effort that is.

Jakub



Re: [PATCH] passes: Enable split4 with selective scheduling 2 [PR98439]

2021-02-13 Thread Richard Biener
On February 13, 2021 9:58:58 AM GMT+01:00, Jakub Jelinek wrote:
wrote:
>Hi!
>
>As mentioned in the PR, we have 5 split passes (+ splitting during
>final).
>split1 is before RA and is unconditional,
>split2 is after RA and is gated on optimize > 0,
>split3 is before sched2 and is gated on
>defined(INSN_SCHEDULING) && optimize > 0 &&
>flag_schedule_insns_after_reload
>split4 is before regstack and is gated on
>  HAVE_ATTR_length && defined (STACK_REGS) && !gate (split3)
>split5 is before shorten_branches and is gated on
>  HAVE_ATTR_length && !defined (STACK_REGS)
>and the splitting during final works only when !HAVE_ATTR_length.
>STACK_REGS is a macro enabled only on i386/x86_64.
>
>The problem with the following testcase is that split3 before sched2
>is the last splitting pass for the target/command line options set,
>but selective scheduling unlike normal scheduling can create new
>instructions that need to be split, which means we ICE during final as
>there are insns that require splitting but nothing split them.
>
>This patch fixes it by doing split4 also when -fselective-scheduling2
>is enabled on x86 and split3 has been run.  As that option isn't on
>by default, it should slow down compilation only for those that enable
>that option.
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Ok. But if required splitting is an IL property maybe we can see sth like 
RTL_split_insns, clear it from passes like selsched and gate split on the 
property and not being present?

Thanks, 
Richard. 

>2021-02-13  Jakub Jelinek  
>
>   PR rtl-optimization/98439
>   * recog.c (pass_split_before_regstack::gate): Enable even when
>   pass_split_before_sched2 is enabled if -fselective-scheduling2 is
>   on.
>
>   * gcc.target/i386/pr98439.c: New test.
>
>--- gcc/recog.c.jj 2021-01-05 13:53:53.306683657 +0100
>+++ gcc/recog.c2021-02-12 16:42:20.202883267 +0100
>@@ -4532,8 +4532,13 @@ pass_split_before_regstack::gate (functi
>   /* If flow2 creates new instructions which need splitting
>  and scheduling after reload is not done, they might not be
>  split until final which doesn't allow splitting
>- if HAVE_ATTR_length.  */
>+ if HAVE_ATTR_length.  Selective scheduling can result in
>+ further instructions that need splitting.  */
>+#ifdef INSN_SCHEDULING
>+  return !enable_split_before_sched2 () || flag_selective_scheduling2;
>+#else
>   return !enable_split_before_sched2 ();
>+#endif
> #else
>   return false;
> #endif
>--- gcc/testsuite/gcc.target/i386/pr98439.c.jj 2021-02-12
>16:46:34.326012956 +0100
>+++ gcc/testsuite/gcc.target/i386/pr98439.c2021-02-12
>16:46:19.925175619 +0100
>@@ -0,0 +1,12 @@
>+/* PR rtl-optimization/98439 */
>+/* { dg-do compile } */
>+/* { dg-options "-march=nehalem -O2 -fselective-scheduling2
>-fno-cprop-registers" } */
>+
>+int v;
>+int bar (int, int, int, int, int, int, int);
>+
>+int
>+foo (void)
>+{
>+  return bar (0, 0, 0, 0, 0, 0, v > 0 ? v : 0);
>+}
>
>   Jakub



Re: [PATCH 2/2] sparc: Run SUBTARGET_INIT_BUILTINS if it exists

2021-02-13 Thread coypu--- via Gcc-patches
I hope that writing the detailed commit message will encourage someone
with better knowledge of GCC internals to point out a better place for
this logic. I can follow through with any suggestions :)

On Sat, Feb 13, 2021 at 12:20:30PM +, Maya Rashish wrote:
> Some subtargets don't provide the canonical function names as
> the symbol name in C libraries, and libcalls will only work if
> the builtins are patched to emit the correct library name.
> 
> For example, on NetBSD, cabsl has the symbol name __c99_cabsl,
> and the patching is done via netbsd_patch_builtin.
> 
> With this change, libgfortran.so is correctly built with a
> reference to __c99_cabsl, instead of "cabsl" which is not defined.


[PATCH 2/2] sparc: Run SUBTARGET_INIT_BUILTINS if it exists

2021-02-13 Thread Maya Rashish via Gcc-patches
Some subtargets don't provide the canonical function names as
the symbol name in C libraries, and libcalls will only work if
the builtins are patched to emit the correct library name.

For example, on NetBSD, cabsl has the symbol name __c99_cabsl,
and the patching is done via netbsd_patch_builtin.

With this change, libgfortran.so is correctly built with a
reference to __c99_cabsl, instead of "cabsl" which is not defined.
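
The underlying mechanism can be pictured with an asm label (a hedged
sketch with hypothetical names; netbsd_patch_builtin arranges the
equivalent renaming for the compiler's builtin declarations):

    /* Calls to mathfn below reference the symbol __c99_mathfn rather
       than mathfn, the same redirection the hook performs for builtins.  */
    extern double mathfn (double) __asm__ ("__c99_mathfn");

    double
    call_it (double x)
    {
      return mathfn (x);
    }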

gcc/ChangeLog:
* config/sparc/sparc.c
  (sparc_init_builtins): Call SUBTARGET_INIT_BUILTINS.
---
 gcc/config/sparc/sparc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c
index f3557936114..fe6475f2520 100644
--- a/gcc/config/sparc/sparc.c
+++ b/gcc/config/sparc/sparc.c
@@ -10962,6 +10962,9 @@ sparc_init_builtins (void)
 
   if (TARGET_VIS)
 sparc_vis_init_builtins ();
+#ifdef SUBTARGET_INIT_BUILTINS
+  SUBTARGET_INIT_BUILTINS;
+#endif
 }
 
 /* Create builtin functions for FPU instructions.  */
-- 
2.28.0



[PATCH 1/2] aarch64: Run SUBTARGET_INIT_BUILTINS if it exists

2021-02-13 Thread Maya Rashish via Gcc-patches
Some subtargets don't provide the canonical function names as
the symbol name in C libraries, and libcalls will only work if
the builtins are patched to emit the correct library name.

For example, on NetBSD, cabsl has the symbol name __c99_cabsl,
and the patching is done via netbsd_patch_builtin.

With this change, libgfortran.so is correctly built with a
reference to __c99_cabsl, instead of "cabsl" which is not defined.

gcc/ChangeLog:
* config/aarch64/aarch64-builtins.c
  (aarch64_general_init_builtins): Call SUBTARGET_INIT_BUILTINS.
---
 gcc/config/aarch64/aarch64-builtins.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-builtins.c 
b/gcc/config/aarch64/aarch64-builtins.c
index 25ab866ccd4..df13c9be051 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1459,6 +1459,10 @@ aarch64_general_init_builtins (void)
 
   if (TARGET_MEMTAG)
 aarch64_init_memtag_builtins ();
+
+#ifdef SUBTARGET_INIT_BUILTINS
+  SUBTARGET_INIT_BUILTINS;
+#endif
 }
 
 /* Implement TARGET_BUILTIN_DECL for the AARCH64_BUILTIN_GENERAL group.  */
-- 
2.28.0



Re: [PATCH] i386: Add combiner splitter to optimize V2SImode memory rotation [PR96166]

2021-02-13 Thread Jakub Jelinek via Gcc-patches
On Sat, Feb 13, 2021 at 09:59:57AM +0100, Uros Bizjak wrote:
> BTW: I wonder if this optimization triggers frequently, the code in
> the PR looks quite useful.

On my next bootstrap/regtest I can gather some statistics.

Jakub



[PATCH] match.pd: Fix up A % (cast) (pow2cst << B) simplification [PR99079]

2021-02-13 Thread Jakub Jelinek via Gcc-patches
Hi!

The (mod @0 (convert?@3 (power_of_two_cand@1 @2))) simplification
uses the tree_nop_conversion_p (type, TREE_TYPE (@3)) condition, but I believe
it doesn't check what it was meant to check.  On convert?@3,
TREE_TYPE (@3) is not the type of what it has been converted from, but
what it has been converted to, which needs to be (because it is an operand
of a normal binary operation) equal or compatible to the type of the modulo
result and first operand - type.
I could fix that by using && tree_nop_conversion_p (type, TREE_TYPE (@1))
and be done with it, but actually most of the non-nop conversions are IMHO
ok and so we would regress those optimizations.
In particular, if we have say narrowing conversions (foo5 and foo6 in
the new testcase), I think we are fine, either the shift of the power of two
constant after narrowing conversion is still that power of two (or negation
of that) and then it will still work, or the result of narrowing conversion
is 0 and then we would have UB which we can ignore.
Similarly, widening conversions where the shift result is unsigned are fine,
or even widening conversions where the shift result is signed, but we sign
extend to a signed wider divisor, the problematic case of INT_MIN will
become x % (long long) INT_MIN and we can still optimize that to
x & (long long) INT_MAX.
What doesn't work is the case in the pr99079.c testcase, widening conversion
of a signed shift result to wider unsigned divisor, where if the shift
is negative, we end up with x % (unsigned long long) INT_MIN which is
x % 0xffffffff80000000ULL where the divisor is not a power of two and
we can't optimize that to x & 0x7fffffffULL.

So, the patch rejects only the single problematic case.
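
A worked example of that case (assumes 32-bit int and 64-bit unsigned
long long; hypothetical snippet, not part of the testsuite):

    #include <limits.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Sign extension makes the widened divisor non-power-of-two.  */
      unsigned long long d = (unsigned long long) INT_MIN;
      printf ("%llx\n", d);  /* prints ffffffff80000000 */
      return 0;
    }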

Furthermore, when the shift result is signed, we were introducing UB into
a program which previously didn't have one (well, left shift into the sign
bit is UB in some language/version pairs, but it is definitely valid in
C++20 - wonder if I shouldn't move the gcc.c-torture/execute/pr99079.c
testcase to g++.dg/torture/pr99079.C and use -std=c++20), by adding that
subtraction of 1, x % (1 << 31) in C++20 is well defined, but
x & ((1 << 31) - 1) triggers UB on the subtraction.
So, the patch performs the subtraction in the unsigned type if it isn't
wrapping.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-02-13  Jakub Jelinek  

PR tree-optimization/99079
* match.pd (A % (pow2pcst << N) -> A & ((pow2pcst << N) - 1)): Remove
useless tree_nop_conversion_p (type, TREE_TYPE (@3)) check.  Instead
require both type and TREE_TYPE (@1) to be integral types and either
type having smaller or equal precision, or TREE_TYPE (@1) being
unsigned type, or type being signed type.  If TREE_TYPE (@1)
doesn't have wrapping overflow, perform the subtraction of one in
unsigned type.

* gcc.dg/fold-modpow2-2.c: New test.
* gcc.c-torture/execute/pr99079.c: New test.

--- gcc/match.pd.jj 2021-01-28 16:11:30.0 +0100
+++ gcc/match.pd2021-02-12 20:17:26.656857183 +0100
@@ -619,12 +619,23 @@ (define_operator_list COND_TERNARY
 (shift @0 (bit_and @1 (minus @2 { build_int_cst (TREE_TYPE (@2),
  1); }))
  (simplify
-  (mod @0 (convert?@3 (power_of_two_cand@1 @2)))
-  (if ((TYPE_UNSIGNED (type)
-   || tree_expr_nonnegative_p (@0))
-   && tree_nop_conversion_p (type, TREE_TYPE (@3))
-   && integer_pow2p (@2) && tree_int_cst_sgn (@2) > 0)
-   (bit_and @0 (convert (minus @1 { build_int_cst (TREE_TYPE (@1), 1); }))
+  (mod @0 (convert? (power_of_two_cand@1 @2)))
+  (if ((TYPE_UNSIGNED (type) || tree_expr_nonnegative_p (@0))
+   /* Allow any integral conversions of the divisor, except
+ conversion from narrower signed to wider unsigned type
+ where if @1 would be negative power of two, the divisor
+ would not be a power of two.  */
+   && INTEGRAL_TYPE_P (type)
+   && INTEGRAL_TYPE_P (TREE_TYPE (@1))
+   && (TYPE_PRECISION (type) <= TYPE_PRECISION (TREE_TYPE (@1))
+  || TYPE_UNSIGNED (TREE_TYPE (@1))
+  || !TYPE_UNSIGNED (type))
+   && integer_pow2p (@2) && tree_int_cst_sgn (@2) > 0)
+   (with { tree utype = TREE_TYPE (@1);
+  if (!TYPE_OVERFLOW_WRAPS (utype))
+utype = unsigned_type_for (utype); }
+(bit_and @0 (convert (minus (convert:utype @1)
+   { build_one_cst (utype); })))
 
 /* Simplify (unsigned t * 2)/2 -> unsigned t & 0x7FFF.  */
 (simplify
--- gcc/testsuite/gcc.dg/fold-modpow2-2.c.jj2021-02-12 19:36:45.833237766 
+0100
+++ gcc/testsuite/gcc.dg/fold-modpow2-2.c   2021-02-12 20:03:20.413315445 
+0100
@@ -0,0 +1,47 @@
+/* PR tree-optimization/99079 */
+/* { dg-do compile { target { lp64 || ilp32 } } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+unsigned int
+foo1 (unsigned int a, unsigned int b)
+{
+  return a % (1 << b);
+}
+
+int

Re: [PATCH] i386: Add combiner splitter to optimize V2SImode memory rotation [PR96166]

2021-02-13 Thread Uros Bizjak via Gcc-patches
On Sat, Feb 13, 2021 at 9:47 AM Jakub Jelinek  wrote:
>
> Hi!
>
> Since the x86 backend enabled V2SImode vectorization (with
> TARGET_MMX_WITH_SSE), slp vectorization can kick in and emit
> movq(%rdi), %xmm1
> pshufd  $225, %xmm1, %xmm0
> movq%xmm0, (%rdi)
> instead of
> rolq$32, (%rdi)
> we used to emit (or emit when slp vectorization is disabled).
> I think the rotate is both smaller and faster, so this patch adds
> a combiner splitter to optimize that back.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2021-02-13  Jakub Jelinek  
>
> PR target/96166
> * config/i386/mmx.md (*mmx_pshufd_1): Add a combine splitter for
> swap of V2SImode elements in memory into DImode memory rotate by 32.
>
> * gcc.target/i386/pr96166.c: New test.

OK.

BTW: I wonder if this optimization triggers frequently, the code in
the PR looks quite useful.

Thanks,
Uros.

> --- gcc/config/i386/mmx.md.jj   2021-02-03 09:10:28.741347389 +0100
> +++ gcc/config/i386/mmx.md  2021-02-12 15:27:38.159393941 +0100
> @@ -2076,6 +2076,17 @@ (define_insn "*mmx_pshufd_1"
> (set_attr "length_immediate" "1")
> (set_attr "mode" "TI")])
>
> +;; Optimize V2SImode load from memory, swapping the elements and
> +;; storing back into the memory into DImode rotate of the memory by 32.
> +(define_split
> +  [(set (match_operand:V2SI 0 "memory_operand")
> +   (vec_select:V2SI (match_dup 0)
> + (parallel [(const_int 1) (const_int 0)])))]
> +  "TARGET_64BIT && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())"
> +  [(set (match_dup 0)
> +   (rotate:DI (match_dup 0) (const_int 32)))]
> +  "operands[0] = adjust_address (operands[0], DImode, 0);")
> +
>  (define_insn "mmx_pswapdv2si2"
>[(set (match_operand:V2SI 0 "register_operand" "=y,Yv")
> (vec_select:V2SI
> --- gcc/testsuite/gcc.target/i386/pr96166.c.jj  2021-02-12 15:36:31.251410875 
> +0100
> +++ gcc/testsuite/gcc.target/i386/pr96166.c 2021-02-12 15:37:26.467792818 
> +0100
> @@ -0,0 +1,21 @@
> +/* PR target/96166 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O3 -mtune=generic -masm=att" } */
> +/* { dg-final { scan-assembler "rolq\\s\\\$32, \\\(%\[re]di\\\)" } } */
> +
> +static inline void
> +swap (int *x, int *y)
> +{
> +  int tmp = *x;
> +  *x = *y;
> +  *y = tmp;
> +}
> +
> +void
> +bar (int (*x)[2])
> +{
> +  int y[2];
> +  __builtin_memcpy (&y, x, sizeof *x);
> +  swap (&y[0], &y[1]);
> +  __builtin_memcpy (x, &y, sizeof *x);
> +}
>
> Jakub
>


[PATCH] passes: Enable split4 with selective scheduling 2 [PR98439]

2021-02-13 Thread Jakub Jelinek via Gcc-patches
Hi!

As mentioned in the PR, we have 5 split passes (+ splitting during final).
split1 is before RA and is unconditional,
split2 is after RA and is gated on optimize > 0,
split3 is before sched2 and is gated on
  defined(INSN_SCHEDULING) && optimize > 0 && flag_schedule_insns_after_reload
split4 is before regstack and is gated on
  HAVE_ATTR_length && defined (STACK_REGS) && !gate (split3)
split5 is before shorten_branches and is gated on
  HAVE_ATTR_length && !defined (STACK_REGS)
and the splitting during final works only when !HAVE_ATTR_length.
STACK_REGS is a macro enabled only on i386/x86_64.

The problem with the following testcase is that split3 before sched2
is the last splitting pass for the target/command line options set,
but selective scheduling unlike normal scheduling can create new
instructions that need to be split, which means we ICE during final as
there are insns that require splitting but nothing split them.

This patch fixes it by doing split4 also when -fselective-scheduling2
is enabled on x86 and split3 has been run.  As that option isn't on
by default, it should slow down compilation only for those that enable
that option.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-02-13  Jakub Jelinek  

PR rtl-optimization/98439
* recog.c (pass_split_before_regstack::gate): Enable even when
pass_split_before_sched2 is enabled if -fselective-scheduling2 is
on.

* gcc.target/i386/pr98439.c: New test.

--- gcc/recog.c.jj  2021-01-05 13:53:53.306683657 +0100
+++ gcc/recog.c 2021-02-12 16:42:20.202883267 +0100
@@ -4532,8 +4532,13 @@ pass_split_before_regstack::gate (functi
   /* If flow2 creates new instructions which need splitting
  and scheduling after reload is not done, they might not be
  split until final which doesn't allow splitting
- if HAVE_ATTR_length.  */
+ if HAVE_ATTR_length.  Selective scheduling can result in
+ further instructions that need splitting.  */
+#ifdef INSN_SCHEDULING
+  return !enable_split_before_sched2 () || flag_selective_scheduling2;
+#else
   return !enable_split_before_sched2 ();
+#endif
 #else
   return false;
 #endif
--- gcc/testsuite/gcc.target/i386/pr98439.c.jj  2021-02-12 16:46:34.326012956 
+0100
+++ gcc/testsuite/gcc.target/i386/pr98439.c 2021-02-12 16:46:19.925175619 
+0100
@@ -0,0 +1,12 @@
+/* PR rtl-optimization/98439 */
+/* { dg-do compile } */
+/* { dg-options "-march=nehalem -O2 -fselective-scheduling2 
-fno-cprop-registers" } */
+
+int v;
+int bar (int, int, int, int, int, int, int);
+
+int
+foo (void)
+{
+  return bar (0, 0, 0, 0, 0, 0, v > 0 ? v : 0);
+}

Jakub



[PATCH] i386: Add combiner splitter to optimize V2SImode memory rotation [PR96166]

2021-02-13 Thread Jakub Jelinek via Gcc-patches
Hi!

Since the x86 backend enabled V2SImode vectorization (with
TARGET_MMX_WITH_SSE), slp vectorization can kick in and emit
movq(%rdi), %xmm1
pshufd  $225, %xmm1, %xmm0
movq%xmm0, (%rdi)
instead of
rolq$32, (%rdi)
we used to emit (or emit when slp vectorization is disabled).
I think the rotate is both smaller and faster, so this patch adds
a combiner splitter to optimize that back.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-02-13  Jakub Jelinek  

PR target/96166
* config/i386/mmx.md (*mmx_pshufd_1): Add a combine splitter for
swap of V2SImode elements in memory into DImode memory rotate by 32.

* gcc.target/i386/pr96166.c: New test.

--- gcc/config/i386/mmx.md.jj   2021-02-03 09:10:28.741347389 +0100
+++ gcc/config/i386/mmx.md  2021-02-12 15:27:38.159393941 +0100
@@ -2076,6 +2076,17 @@ (define_insn "*mmx_pshufd_1"
(set_attr "length_immediate" "1")
(set_attr "mode" "TI")])
 
+;; Optimize V2SImode load from memory, swapping the elements and
+;; storing back into the memory into DImode rotate of the memory by 32.
+(define_split
+  [(set (match_operand:V2SI 0 "memory_operand")
+   (vec_select:V2SI (match_dup 0)
+ (parallel [(const_int 1) (const_int 0)])))]
+  "TARGET_64BIT && (TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())"
+  [(set (match_dup 0)
+   (rotate:DI (match_dup 0) (const_int 32)))]
+  "operands[0] = adjust_address (operands[0], DImode, 0);")
+
 (define_insn "mmx_pswapdv2si2"
   [(set (match_operand:V2SI 0 "register_operand" "=y,Yv")
(vec_select:V2SI
--- gcc/testsuite/gcc.target/i386/pr96166.c.jj  2021-02-12 15:36:31.251410875 
+0100
+++ gcc/testsuite/gcc.target/i386/pr96166.c 2021-02-12 15:37:26.467792818 
+0100
@@ -0,0 +1,21 @@
+/* PR target/96166 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -mtune=generic -masm=att" } */
+/* { dg-final { scan-assembler "rolq\\s\\\$32, \\\(%\[re]di\\\)" } } */
+
+static inline void
+swap (int *x, int *y)
+{
+  int tmp = *x;
+  *x = *y;
+  *y = tmp;
+}
+
+void
+bar (int (*x)[2])
+{
+  int y[2];
+  __builtin_memcpy (&y, x, sizeof *x);
+  swap (&y[0], &y[1]);
+  __builtin_memcpy (x, &y, sizeof *x);
+}

Jakub