libgo patch committed: Only compile ptrace shim on GNU/Linux

2020-10-21 Thread Ian Lance Taylor via Gcc-patches
This libgo patch by Nikhil Benesch only compiles the ptrace varargs
shim on GNU/Linux, to avoid compilation failures on some other
platforms. The C ptrace function is not entirely portable (e.g.,
NetBSD has `int data` instead of `void* data`), and so far Linux is
the only platform that needs the varargs shim.

Additionally, make the types in the ptrace and raw_ptrace function
declarations match. This makes it more clear that the only difference
between the two is that calls via the former are allowed to block
while calls via the latter are not.
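For reference, the Linux-only varargs shim amounts to a fixed-arity C
wrapper over the variadic libc prototype; a minimal sketch (assumed shape
and name, not the exact libgo source):

/* glibc declares ptrace as variadic; Go's //sys bindings need a
   fixed-arity symbol to call, so provide one.  */
#include <sys/ptrace.h>
#include <sys/types.h>

long
__go_ptrace (int request, pid_t pid, void *addr, void *data)
{
  return ptrace (request, pid, addr, data);
}

On a platform like NetBSD, whose ptrace takes int rather than void* for
the last parameter, such a wrapper would not match the system prototype,
which is why the shim is now compiled only on GNU/Linux.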

Bootstrapped and ran Go testsuite on x86_64-pc-linux-gnu.  Committed
to mainline.

Ian
c3627fd56a707b6059b86e7ca018166f080b5403
diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE
index aedaf4664d5..384ca6e8864 100644
--- a/gcc/go/gofrontend/MERGE
+++ b/gcc/go/gofrontend/MERGE
@@ -1,4 +1,4 @@
-64c25b2365f7125a32b3146618b627f26a78c1fc
+fa66bd11bbe58943e273cfa74356771c996f5b24
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
diff --git a/libgo/go/syscall/exec_bsd.go b/libgo/go/syscall/exec_bsd.go
index 7e06943f13e..ca7fdc0825e 100644
--- a/libgo/go/syscall/exec_bsd.go
+++ b/libgo/go/syscall/exec_bsd.go
@@ -93,7 +93,7 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr
 
// Enable tracing if requested.
if sys.Ptrace {
-   err1 = raw_ptrace(_PTRACE_TRACEME, 0, nil, nil)
+   err1 = raw_ptrace(_PTRACE_TRACEME, 0, 0, 0)
if err1 != 0 {
goto childerror
}
diff --git a/libgo/go/syscall/exec_linux.go b/libgo/go/syscall/exec_linux.go
index 2f0a34fef0b..38975810432 100644
--- a/libgo/go/syscall/exec_linux.go
+++ b/libgo/go/syscall/exec_linux.go
@@ -538,7 +538,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
// Do this right before exec so that we don't unnecessarily trace the runtime
// setting up after the fork. See issue #21428.
if sys.Ptrace {
-   err1 = raw_ptrace(_PTRACE_TRACEME, 0, nil, nil)
+   err1 = raw_ptrace(_PTRACE_TRACEME, 0, 0, 0)
if err1 != 0 {
goto childerror
}
diff --git a/libgo/go/syscall/exec_stubs.go b/libgo/go/syscall/exec_stubs.go
index e95b4158e1a..c837cf7a4e2 100644
--- a/libgo/go/syscall/exec_stubs.go
+++ b/libgo/go/syscall/exec_stubs.go
@@ -30,6 +30,6 @@ func (w WaitStatus) Signal() int { return 0 }
 func (w WaitStatus) StopSignal() int { return 0 }
 func (w WaitStatus) TrapCause() int  { return 0 }
 
-func raw_ptrace(request int, pid int, addr *byte, data *byte) Errno {
+func raw_ptrace(request int, pid int, addr uintptr, data uintptr) Errno {
return ENOSYS
 }
diff --git a/libgo/go/syscall/libcall_aix.go b/libgo/go/syscall/libcall_aix.go
index 27b469e1e47..92c7f3cc232 100644
--- a/libgo/go/syscall/libcall_aix.go
+++ b/libgo/go/syscall/libcall_aix.go
@@ -19,7 +19,7 @@ const SYS_EXECVE = 0
//sys  ptrace64(request int, id int64, addr int64, data int, buff uintptr) (err error)
//ptrace64(request _C_int, id int64, addr int64, data _C_int, buff *byte) _C_int
 
-func raw_ptrace(request int, pid int, addr *byte, data *byte) Errno {
+func raw_ptrace(request int, pid int, addr uintptr, data uintptr) Errno {
if request == _PTRACE_TRACEME {
// Convert to AIX ptrace call.
err := ptrace64(_PT_TRACE_ME, 0, 0, 0, 0)
diff --git a/libgo/go/syscall/libcall_glibc.go b/libgo/go/syscall/libcall_glibc.go
index 823343d5075..a32d6968f24 100644
--- a/libgo/go/syscall/libcall_glibc.go
+++ b/libgo/go/syscall/libcall_glibc.go
@@ -31,9 +31,6 @@ func Futimes(fd int, tv []Timeval) (err error) {
return Utimes("/proc/self/fd/"+itoa(fd), tv)
 }
 
-//sys  ptrace(request int, pid int, addr uintptr, data uintptr) (err error)
-//__go_ptrace(request _C_int, pid Pid_t, addr *byte, data *byte) _C_long
-
//sys  accept4(fd int, sa *RawSockaddrAny, len *Socklen_t, flags int) (nfd int, err error)
 //accept4(fd _C_int, sa *RawSockaddrAny, len *Socklen_t, flags _C_int) _C_int
 
diff --git a/libgo/go/syscall/libcall_hurd.go b/libgo/go/syscall/libcall_hurd.go
index f0e038ca616..44ff46d1016 100644
--- a/libgo/go/syscall/libcall_hurd.go
+++ b/libgo/go/syscall/libcall_hurd.go
@@ -7,7 +7,7 @@
 package syscall
 
 // Dummy function
-func raw_ptrace(request int, pid int, addr *byte, data *byte) Errno {
+func raw_ptrace(request int, pid int, addr uintptr, data uintptr) Errno {
return ENOSYS
 }
 
diff --git a/libgo/go/syscall/libcall_irix.go b/libgo/go/syscall/libcall_irix.go
index 9b6cdcca2c8..98807668d52 100644
--- a/libgo/go/syscall/libcall_irix.go
+++ b/libgo/go/syscall/libcall_irix.go
@@ -6,5 +6,5 @@
 
 package syscall
 
-//sysnb raw_ptrace(request int, pid int, addr *byte, data *byte) (err Errno)
+//sysnb raw_ptrace(request int, pid int, addr uintptr, data uintptr) (err Errno)
 

Re: [Ada,FYI] revamp ada.numerics.aux

2020-10-21 Thread Alexandre Oliva
On Oct 18, 2020, Alexandre Oliva  wrote:

> The Aux_Short_Float unit is implemented in terms of the Aux_Float one,
> and the others rely on the elementary functions from the C Math
> library for float, double and long double types, respectively.


Use Aux_Long_Float for all real types on LynxOS

From: Alexandre Oliva 

Its libc does not offer *f or *l elementary functions, so rely on the
C double ones only.
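
At the C level the fallback amounts to computing in double precision and
converting, roughly (an illustrative sketch, not actual runtime code):

#include <math.h>

/* With no sinf in libc, evaluate the float function via the double
   routine and narrow the result.  */
static float
sinf_via_double (float x)
{
  return (float) sin ((double) x);
}

The Aux_Long_Float-based units perform the equivalent conversions in
Ada, as the wraplf specs quoted later in this thread show.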

Tested with the target platform, also regstrapped on x86_64-linux-gnu
along with other patches.  Approved by Eric Botcazou.


for  gcc/ada/ChangeLog

* Makefile.rtl (LIBGNAT_TARGET_PAIRS) : Rely on
Aux_Long_Float for all real types.
---
 gcc/ada/Makefile.rtl |3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/ada/Makefile.rtl b/gcc/ada/Makefile.rtl
index 04c7cc8..efb73e5 100644
--- a/gcc/ada/Makefile.rtl
+++ b/gcc/ada/Makefile.rtl
@@ -2015,6 +2015,9 @@ ifeq ($(strip $(filter-out lynxos178%,$(target_os))),)
 
   LIBGNAT_TARGET_PAIRS = \
  a-intnam.ads

-- 
Alexandre Oliva, happy hacker
https://FSFLA.org/blogs/lxo/
Free Software Activist
GNU Toolchain Engineer


Re: [Ada,FYI] revamp ada.numerics.aux

2020-10-21 Thread Alexandre Oliva
On Oct 18, 2020, Alexandre Oliva  wrote:

> The option is provided by default, but there is an alternate version
> that doesn't, that is used for vxworks targets.

vxworks float EFs not precise enough -> use long float

From: Alexandre Oliva 

Some acats-4 tests that check the precision of Float elementary
functions fail with vxworks 7.2's implementations of single-precision
math functions.

This patch arranges for us to bypass the single-precision functions,
and use the Aux_Long_Float implementation, based on the double-typed
calls from the C library, for Float and Short_Float.

Tested on affected platforms (some regressions remain on x86 and x86_64,
so further changes are expected, but this is incremental progress), also
regstrapped on x86_64-linux-gnu along with other patches, approved by
Eric Botcazou.


for  gcc/ada/ChangeLog

* Makefile.rtl (LIBGNAT_TARGET_PAIRS): Use Long Float-based
variant of Aux_Short_Float and Aux_Float on vxworks targets.
* libgnat/a-nashfl__wraplf.ads: New.
* libgnat/a-nuaufl__wraplf.ads: New.
---
 gcc/ada/Makefile.rtl |8 +++
 gcc/ada/libgnat/a-nashfl__wraplf.ads |   87 ++
 gcc/ada/libgnat/a-nuaufl__wraplf.ads |   87 ++
 3 files changed, 182 insertions(+)
 create mode 100644 gcc/ada/libgnat/a-nashfl__wraplf.ads
 create mode 100644 gcc/ada/libgnat/a-nuaufl__wraplf.ads

diff --git a/gcc/ada/Makefile.rtl b/gcc/ada/Makefile.rtl
index fb7ecb1..04c7cc8 100644
--- a/gcc/ada/Makefile.rtl
+++ b/gcc/ada/Makefile.rtl
@@ -1026,6 +1026,8 @@ ifeq ($(strip $(filter-out powerpc% wrs vxworks vxworksspe vxworks7% vxworks7spe
   LIBGNAT_TARGET_PAIRS = \
  a-intnam.ads

+-- <http://www.gnu.org/licenses/>.  --
+--  --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.  --
+--  --
+--
+
+--  This package provides the basic computational interface for the
+--  generic elementary functions. The functions in this unit are
+--  wrappers for those in the Long Float package.
+
+with Ada.Numerics.Aux_Long_Float;
+
+package Ada.Numerics.Aux_Short_Float is
+   pragma Pure;
+
+   subtype T is Short_Float;
+   package Aux renames Ada.Numerics.Aux_Long_Float;
+   subtype W is Aux.T;
+
+   --  Use the Aux implementation.
+
+   function Sin (X : T) return T
+   is (T (Aux.Sin (W (X))));
+
+   function Cos (X : T) return T
+   is (T (Aux.Cos (W (X))));
+
+   function Tan (X : T) return T
+   is (T (Aux.Tan (W (X))));
+
+   function Exp (X : T) return T
+   is (T (Aux.Exp (W (X))));
+
+   function Sqrt (X : T) return T
+   is (T (Aux.Sqrt (W (X))));
+
+   function Log (X : T) return T
+   is (T (Aux.Log (W (X))));
+
+   function Acos (X : T) return T
+   is (T (Aux.Acos (W (X))));
+
+   function Asin (X : T) return T
+   is (T (Aux.Asin (W (X))));
+
+   function Atan (X : T) return T
+   is (T (Aux.Atan (W (X))));
+
+   function Sinh (X : T) return T
+   is (T (Aux.Sinh (W (X))));
+
+   function Cosh (X : T) return T
+   is (T (Aux.Cosh (W (X))));
+
+   function Tanh (X : T) return T
+   is (T (Aux.Tanh (W (X))));
+
+   function Pow (X, Y : T) return T
+   is (T (Aux.Pow (W (X), W (Y))));
+
+end Ada.Numerics.Aux_Short_Float;
diff --git a/gcc/ada/libgnat/a-nuaufl__wraplf.ads 
b/gcc/ada/libgnat/a-nuaufl__wraplf.ads
new file mode 100644
index ..b6eb22c
--- /dev/null
+++ b/gcc/ada/libgnat/a-nuaufl__wraplf.ads
@@ -0,0 +1,87 @@
+--
+--  --
+-- GNAT RUN-TIME COMPONENTS --
+--  --
+--   A D A . N U M E R I C S . A U X _ F L O A T--
+--  --
+-- S p e c  --
+--   (Double-based Version, Float)  --
+--  --
+--  Copyright (C) 1992-2020, Free Software Foundation, Inc. --
+--  --
+-- GNAT is free software;  you can  redistribute it  and/or modify it under --
+-- terms of the  GNU General Public License as published  by the Free Soft- --
+-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
+-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
+-- OUT ANY WARRANTY;  without even the  

Re: [Ada,FYI] revamp ada.numerics.aux

2020-10-21 Thread Alexandre Oliva
On Oct 20, 2020, Rainer Orth  wrote:

> your patch similarly broke sparc*-sun-solaris* bootstrap.  The attached
> snippet on top of this workaround fixes that.

Thanks, here's what I'm installing, approved by me with my build
machinery maintainer hat on ;-)


Use Aux_Long_Long_Float wraplf for sparc*-sun-solaris too

From: Rainer Orth 

Like aarch64-* and ppc*-linux-gnu, sparc*-sun-solaris has
Long_Long_Float mapped to double rather than long double, so the
intrinsics in the default version of a-nallfl.ads have mismatching
types.  Adopt the wraplf workaround for it as well.


for  gcc/ada/ChangeLog

* Makefile.rtl (LIBGNAT_TARGET_PAIRS) :
Use wraplf version of a-nallfl.
---
 gcc/ada/Makefile.rtl |1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/ada/Makefile.rtl b/gcc/ada/Makefile.rtl
index 87ee3e9..fb7ecb1 100644
--- a/gcc/ada/Makefile.rtl
+++ b/gcc/ada/Makefile.rtl
@@ -1572,6 +1572,7 @@ endif
ifeq ($(strip $(filter-out sparc% sun solaris%,$(target_cpu) $(target_vendor) $(target_os))),)
   LIBGNAT_TARGET_PAIRS = \
  a-intnam.ads

-- 
Alexandre Oliva, happy hacker
https://FSFLA.org/blogs/lxo/
Free Software Activist
GNU Toolchain Engineer


Re: [Ada,FYI] revamp ada.numerics.aux

2020-10-21 Thread Alexandre Oliva
On Oct 19, 2020, Alexandre Oliva  wrote:

> Hello, Andreas,
> On Oct 19, 2020, Andreas Schwab  wrote:

>> -nostdinc a-nallfl.ads -o a-nallfl.o
>> a-nallfl.ads:48:13: warning: intrinsic binding type mismatch on return value
>> a-nallfl.ads:48:13: warning: intrinsic binding type mismatch on argument 1
>> a-nallfl.ads:48:13: warning: profile of "Sin" doesn't match the builtin it 
>> binds

> Thanks for the report.  Ada's Standard.Long_Long_Float is mapped to C
> double rather than long double on this target.

Long story short, we're keeping the mapping as is at least for now.  For
the curious, it's caused by the Max_HW_Digits cap in
Set_Target.C_Type_For, in gcc/ada/set_targ.adb.

So I'm going ahead and checking this in, approved by Eric Botcazou.
(Rainer's followup patch is coming up separately but momentarily)


aarch64-* and ppc*-linux-gnu long long float/long double mismatch

From: Alexandre Oliva 

Some platforms have failed to build because long long float is mapped
to double rather than long double, and then the attempts to import
intrinsics for long double in Aux_Long_Long_Float raise warnings
turned into errors.
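
For the curious, the underlying target property can be seen from C (a
hypothetical standalone check, not part of the patch):

#include <float.h>
#include <stdio.h>

int
main (void)
{
  /* On targets such as aarch64, long double is wider than double,
     while Ada caps Long_Long_Float at double precision; binding the
     long double builtins (sinl etc.) for it thus mismatches.  */
  printf ("DBL_MANT_DIG  = %d\n", DBL_MANT_DIG);
  printf ("LDBL_MANT_DIG = %d\n", LDBL_MANT_DIG);
  return 0;
}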

This patch is a work around for the mismatch, arranging for
Aux_Long_Long_Float to map to Aux_Long_Float.

for  gcc/ada/ChangeLog

* Makefile.rtl (LIBGNAT_TARGET_PAIRS): Use
a-nallfl__wraplf.ads on aarch64-* and ppc*-linux-gnu targets.
* libgnat/a-nallfl__wraplf.ads: New.
---
 gcc/ada/Makefile.rtl |6 ++
 gcc/ada/libgnat/a-nallfl__wraplf.ads |   87 ++
 2 files changed, 93 insertions(+)
 create mode 100644 gcc/ada/libgnat/a-nallfl__wraplf.ads

diff --git a/gcc/ada/Makefile.rtl b/gcc/ada/Makefile.rtl
index 898eb5d..87ee3e9 100644
--- a/gcc/ada/Makefile.rtl
+++ b/gcc/ada/Makefile.rtl
@@ -1402,6 +1402,7 @@ ifeq ($(strip $(filter-out aarch64 arm% coff wrs vx%,$(target_cpu) $(target_vend
 VX=vxworks7
 EH_MECHANISM=-gcc
 SIGTRAMP_OBJ=sigtramp-vxworks.o
+LIBGNAT_TARGET_PAIRS += a-nallfl.ads<a-nallfl__wraplf.ads

+-- <http://www.gnu.org/licenses/>.  --
+--  --
+-- GNAT was originally developed  by the GNAT team at  New York University. --
+-- Extensive contributions were provided by Ada Core Technologies Inc.  --
+--  --
+--
+
+--  This package provides the basic computational interface for the
+--  generic elementary functions. The functions in this unit are
+--  wrappers for those in the Long Float package.
+
+with Ada.Numerics.Aux_Long_Float;
+
+package Ada.Numerics.Aux_Long_Long_Float is
+   pragma Pure;
+
+   subtype T is Long_Long_Float;
+   package Aux renames Ada.Numerics.Aux_Long_Float;
+   subtype W is Aux.T;
+
+   --  Use the Aux implementation.
+
+   function Sin (X : T) return T
+   is (T (Aux.Sin (W (X))));
+
+   function Cos (X : T) return T
+   is (T (Aux.Cos (W (X))));
+
+   function Tan (X : T) return T
+   is (T (Aux.Tan (W (X))));
+
+   function Exp (X : T) return T
+   is (T (Aux.Exp (W (X))));
+
+   function Sqrt (X : T) return T
+   is (T (Aux.Sqrt (W (X))));
+
+   function Log (X : T) return T
+   is (T (Aux.Log (W (X))));
+
+   function Acos (X : T) return T
+   is (T (Aux.Acos (W (X))));
+
+   function Asin (X : T) return T
+   is (T (Aux.Asin (W (X))));
+
+   function Atan (X : T) return T
+   is (T (Aux.Atan (W (X))));
+
+   function Sinh (X : T) return T
+   is (T (Aux.Sinh (W (X))));
+
+   function Cosh (X : T) return T
+   is (T (Aux.Cosh (W (X))));
+
+   function Tanh (X : T) return T
+   is (T (Aux.Tanh (W (X))));
+
+   function Pow (X, Y : T) return T
+   is (T (Aux.Pow (W (X), W (Y))));
+
+end Ada.Numerics.Aux_Long_Long_Float;


-- 
Alexandre Oliva, happy hacker
https://FSFLA.org/blogs/lxo/
Free Software Activist
GNU Toolchain Engineer


Re: enable sincos optimization on cygming targets

2020-10-21 Thread Alexandre Oliva
Err, sorry, I mislabeled this patch as [FYI,Ada], but it is neither
about Ada nor pre-approved.  It does require a review before I can check
it in.

On Oct 22, 2020, Alexandre Oliva  wrote:

> for  gcc/ChangeLog

>   * config/i386/cygming.h (TARGET_LIBC_HAS_FUNCTION): Enable
>   sincos optimization.

> -#define TARGET_LIBC_HAS_FUNCTION no_c99_libc_has_function
> +#define TARGET_LIBC_HAS_FUNCTION gnu_libc_has_function
 
-- 
Alexandre Oliva, happy hacker
https://FSFLA.org/blogs/lxo/
Free Software Activist
GNU Toolchain Engineer


[FYI,Ada] enable sincos optimization on cygming targets

2020-10-21 Thread Alexandre Oliva


The sincos transformation does not take place on all platforms,
because the libc_has_function target hook disables it by default.

Current w64-mingw's math library supports sincos, sincosl and sincosf,
in 32- and 64-bit modes.  I suppose this has been this way for long,
also with mingw32 and cygwin.

This patch enables the sincos optimization on these platforms.
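
The transformation fuses paired calls; for example (hypothetical user
code, not from the patch):

#include <math.h>

/* With the optimization enabled, GCC can combine the two calls below
   into a single sincos (theta, &s, &c) library call.  */
void
polar_to_cart (double r, double theta, double *x, double *y)
{
  *x = r * cos (theta);
  *y = r * sin (theta);
}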

Tested with builds targeting mingw-w64, also regstrapped along with
other patches on x86_64-linux-gnu.  Ok to install?


for  gcc/ChangeLog

* config/i386/cygming.h (TARGET_LIBC_HAS_FUNCTION): Enable
sincos optimization.
---
 gcc/config/i386/cygming.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h
index 1b1ea7d..82fc4d6 100644
--- a/gcc/config/i386/cygming.h
+++ b/gcc/config/i386/cygming.h
@@ -182,7 +182,7 @@ along with GCC; see the file COPYING3.  If not see
 #define MATH_LIBRARY ""
 
 #undef TARGET_LIBC_HAS_FUNCTION
-#define TARGET_LIBC_HAS_FUNCTION no_c99_libc_has_function
+#define TARGET_LIBC_HAS_FUNCTION gnu_libc_has_function
 
 #define SIZE_TYPE (TARGET_64BIT ? "long long unsigned int" : "unsigned int")
 #define PTRDIFF_TYPE (TARGET_64BIT ? "long long int" : "int")

-- 
Alexandre Oliva, happy hacker
https://FSFLA.org/blogs/lxo/
Free Software Activist
GNU Toolchain Engineer


Re: [PATCH] [PR rtl-optimization/97249]Simplify vec_select of paradoxical subreg.

2020-10-21 Thread Hongtao Liu via Gcc-patches
On Thu, Oct 22, 2020 at 12:36 AM Segher Boessenkool
 wrote:
>
> On Wed, Oct 21, 2020 at 04:43:29PM +0100, Richard Sandiford wrote:
> > Hongtao Liu  writes:
> > > + poly_uint64 nunits
> > > +   = GET_MODE_NUNITS (GET_MODE (SUBREG_REG (trueop0)));
> > > + rtx par = trueop1;
> > > + for (int i = 0; i != l1; i++)
> > > +   {
> > > + rtx idx = XVECEXP (trueop1, 0, i);
> > > + if (!CONST_INT_P (idx)
> > > + || maybe_ge (UINTVAL (idx) + subreg_offset, nunits))
> > > +   return 0;
> > > +   }
> >
> > I think the previous version was better.  We shouldn't assume that
> > further simplification rules will fail just because the conditions
> > for this rule haven't been met.
>
> Yes.  My suggestion was to factor this big piece of code to a separate
> function, and do an early return from *that*.
>
> The patch is okay for trunk without that, with the clumsy booleans.
> Thanks Hongtao!
>
>
> Segher

Thank you both for the review, I'll commit the patch with *bool success* kept.

-- 
BR,
Hongtao


Re: [PATCH] Put absolute address jump table in data.rel.ro.local if targets support relocations

2020-10-21 Thread HAO CHEN GUI via Gcc-patches
I had a wrong email setting and got your reply later. I modified the 
patch according to your advice. Could you please review it again? Thanks.


On 2/10/2020 1:47 AM, Richard Sandiford wrote:

Sorry for the slow review.

HAO CHEN GUI via Gcc-patches  writes:

diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 513fc5fe295..6f5bf8d7d73 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -9315,10 +9315,10 @@ mips_select_rtx_section (machine_mode mode, rtx x,
 default_function_rodata_section.  */
  
  static section *

-mips_function_rodata_section (tree decl)
+mips_function_rodata_section (tree decl, bool relocatable ATTRIBUTE_UNUSED)

Now that we're C++, it's more idiomatic to leave off the parameter name:

   mips_function_rodata_section (tree decl, bool)

Same for the rest of the patch.


@@ -2491,9 +2491,19 @@ final_scan_insn_1 (rtx_insn *insn, FILE *file, int optimize_p ATTRIBUTE_UNUSED,
  if (! JUMP_TABLES_IN_TEXT_SECTION)
{
  int log_align;
+ bool relocatable;
+
+ relocatable = 0;

Very minor, but simpler as:

bool relocatable = false;

Same for the later hunk.


@@ -549,16 +549,17 @@ Whatever the actual target object format, this is often good enough.",
   void, (tree decl, int reloc),
   default_unique_section)
  
-/* Return the readonly data section associated with function DECL.  */

+/* Return the readonly or relocated readonly data section
+   associated with function DECL.  */
  DEFHOOK
  (function_rodata_section,
- "Return the readonly data section associated with\n\
+ "Return the readonly or reloc readonly data section associated with\n\
  @samp{DECL_SECTION_NAME (@var{decl})}.\n\

Maybe add “; @var{relocatable} selects the latter over the former.”


  The default version of this function selects @code{.gnu.linkonce.r.name} if\n\
  the function's section is @code{.gnu.linkonce.t.name}, @code{.rodata.name}\n\
-if function is in @code{.text.name}, and the normal readonly-data section\n\
-otherwise.",
- section *, (tree decl),
+or @code{.data.rel.ro.name} if function is in @code{.text.name}, and\n\
+the normal readonly-data or reloc readonly data section otherwise.",
+ section *, (tree decl, bool relocatable),
   default_function_rodata_section)
  
  /* Nonnull if the target wants to override the default ".rodata" prefix

diff --git a/gcc/varasm.c b/gcc/varasm.c
index 4070f9c17e8..91ab75aed06 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -726,12 +726,26 @@ switch_to_other_text_partition (void)
switch_to_section (current_function_section ());
  }
  
-/* Return the read-only data section associated with function DECL.  */

+/* Return the read-only or relocated read-only data section
+   associated with function DECL.  */
  
  section *

-default_function_rodata_section (tree decl)
+default_function_rodata_section (tree decl, bool relocatable)
  {
-  if (decl != NULL_TREE && DECL_SECTION_NAME (decl))
+  const char* sname;
+  unsigned int flags;
+
+  flags = 0;
+
+  if (relocatable)
+{
+  sname = ".data.rel.ro.local";
+  flags = (SECTION_WRITE | SECTION_RELRO);
+}
+  else
+sname = ".rodata";
+
+  if (decl && DECL_SECTION_NAME (decl))
  {
const char *name = DECL_SECTION_NAME (decl);
  
@@ -744,12 +758,12 @@ default_function_rodata_section (tree decl)

  dot = strchr (name + 1, '.');
  if (!dot)
dot = name;
- len = strlen (dot) + 8;
+ len = strlen (dot) + strlen (sname) + 1;
  rname = (char *) alloca (len);
  
-	  strcpy (rname, ".rodata");

+ strcpy (rname, sname);
  strcat (rname, dot);
- return get_section (rname, SECTION_LINKONCE, decl);
+ return get_section (rname, (SECTION_LINKONCE | flags), decl);
}
/* For .gnu.linkonce.t.foo we want to use .gnu.linkonce.r.foo.  */
else if (DECL_COMDAT_GROUP (decl)
@@ -767,15 +781,18 @@ default_function_rodata_section (tree decl)
   && strncmp (name, ".text.", 6) == 0)
{
  size_t len = strlen (name) + 1;
- char *rname = (char *) alloca (len + 2);
+ char *rname = (char *) alloca (len + strlen (sname) - 5);
  
-	  memcpy (rname, ".rodata", 7);

- memcpy (rname + 7, name + 5, len - 5);
- return get_section (rname, 0, decl);
+ memcpy (rname, sname, strlen (sname));
+ memcpy (rname + strlen (sname), name + 5, len - 5);
+ return get_section (rname, flags, decl);
}
  }

Don't we need to handle the .gnu.linkonce.t. case too?  I believe
the suffix there is “.d.rel.ro.local” (replacing “.t”)

My main concern is how this interacts with non-ELF targets.
It looks like AIX/XCOFF, Darwin and Cygwin already pick
default_no_function_rodata_section, so they should be fine.
But at the moment, all the fancy stuff in default_function_rodata_section
is indirectly guarded by targetm_common.have_named_sections, with the
hook falling back to 

Re: [PATCH] RISC-V: Extend syntax for the multilib-generator

2020-10-21 Thread Jim Wilson
On Wed, Oct 21, 2020 at 7:36 PM Jim Wilson  wrote:

>
>
> On Fri, Oct 16, 2020 at 2:34 AM Kito Cheng  wrote:
>
>> +# Example 2:
>> +#  rv32imafd-ilp32d--c*b
>> +# means that, in addition to rv32imafd, these configurations can also
>> use the
>> +# rv32imafd-ilp32d libraries: rv32imafd-ilp32dc, rv32imafd-ilp32db,
>> +# rv32imafd-ilp32dcb
>>
>
> You accidentally added the c and b to the abi not the arch.
>

otherwise this looks good, and very useful.

Jim


Re: [PATCH] RISC-V: Extend syntax for the multilib-generator

2020-10-21 Thread Jim Wilson
On Fri, Oct 16, 2020 at 2:34 AM Kito Cheng  wrote:

> +# Example 2:
> +#  rv32imafd-ilp32d--c*b
> +# means that, in addition to rv32imafd, these configurations can also use
> the
> +# rv32imafd-ilp32d libraries: rv32imafd-ilp32dc, rv32imafd-ilp32db,
> +# rv32imafd-ilp32dcb
>

You accidentally added the c and b to the abi not the arch.

>
>  from __future__ import print_function
>  import sys
>  import collections
> +import itertools
> +from functools import reduce
> +
> +#
> +# TODO: Add test for this script.
> +#
>
>  arches = collections.OrderedDict()
>  abis = collections.OrderedDict()
> @@ -37,6 +49,7 @@ required = []
>  reuse = []
>
>  canonical_order = "mafdgqlcbjtpvn"
> +LONG_EXT_PREFIXES = ['z', 's', 'h', 'x']
>
>  #
>  # IMPLIED_EXT(ext) -> implied extension list.
> @@ -59,14 +72,13 @@ def arch_canonicalize(arch):
># TODO: Support extension version.
>new_arch = ""
>if arch[:5] in ['rv32e', 'rv32i', 'rv32g', 'rv64i', 'rv64g']:
> -# TODO: We should expand g to imadzifencei once we support newer spec.
> +# TODO: We should expand g to imad_zifencei once we support newer
> spec.
>  new_arch = arch[:5].replace("g", "imafd")
>else:
>  raise Exception("Unexpected arch: `%s`" % arch[:5])
>
># Find any Z, S, H or X
> -  long_ext_prefixes = ['z', 's', 'h', 'x']
> -  long_ext_prefixes_idx = map(lambda x: arch.find(x), long_ext_prefixes)
> +  long_ext_prefixes_idx = map(lambda x: arch.find(x), LONG_EXT_PREFIXES)
>
># Filter out any non-existent index.
>long_ext_prefixes_idx = list(filter(lambda x: x != -1,
> long_ext_prefixes_idx))
> @@ -93,7 +105,7 @@ def arch_canonicalize(arch):
>std_exts += list(filter(lambda x:len(x) == 1, long_exts))
>
># Multi-letter extension must be in lexicographic order.
> -  long_exts = sorted(filter(lambda x:len(x) != 1, long_exts))
> +  long_exts = list(sorted(filter(lambda x:len(x) != 1, long_exts)))
>
># Put extensions in canonical order.
>for ext in canonical_order:
> @@ -112,6 +124,85 @@ def arch_canonicalize(arch):
>  new_arch += "_" + "_".join(long_exts)
>return new_arch
>
> +#
> +# add underline for each multi-char extensions.
> +# e.g. ["a", "zfh"] -> ["a", "_zfh"]
> +#
> +def add_underline_prefix(ext):
> +  for long_ext_prefix in LONG_EXT_PREFIXES:
> +if ext.startswith(long_ext_prefix):
> +  return "_" + ext
> +
> +  return ext
> +
> +#
> +# Handle expansion operation.
> +#
> +# e.g. "a*b" -> [("a",), ("b",), ("a", "b")]
> +#  "a"   -> [("a",)]
> +#
> +def _expand_combination(ext):
> +  exts = list(ext.split("*"))
> +
> +  # No need to expand if there is no `*`.
> +  if len(exts) == 1:
> +return [(exts[0],)]
> +
> +  # Add underline to every extension.
> +  # e.g.
> +  #  _b * zvamo => _b * _zvamo
> +  exts = list(map(lambda x: '_' + x, exts))
> +
> +  # Generate combination!
> +  ext_combs = []
> +  for comb_len in range(1, len(exts)+1):
> +for ext_comb in itertools.combinations(exts, comb_len):
> +  ext_combs.append(ext_comb)
> +
> +  return ext_combs
> +
> +#
> +# Input a list and drop duplicated entry.
> +# e.g.
> +#   ["a", "b", "ab", "a"] -> ["a", "b", "ab"]
> +#
> +def unique(x):
> +  #
> +  # Drop duplicated entry.
> +  # Convert list to set and then convert back to list.
> +  #
> +  # Add sorted to prevent non-deterministic results in different env.
> +  #
> +  return list(sorted(list(set(x))))
> +
> +#
> +# Expand EXT string if there is any expansion operator (*).
> +# e.g.
> +#   "a*b,c" -> ["a", "b", "ab", "c"]
> +#
> +def expand_combination(ext):
> +  ext = list(filter(None, ext.split(',')))
> +
> +  # Expand combination for EXT, got lots of list.
> +  # e.g.
> +  #   a * b => [[("a",), ("b",)], [("a", "b")]]
> +  ext_combs = list(map(_expand_combination, ext))
> +
> +  # Then fold to single list.
> +  # e.g.
> +  #   [[("a",), ("b",)], [("a", "b")]] => [("a",), ("b",), ("a", "b")]
> +  ext = list(reduce(lambda x, y: x + y, ext_combs, []))
> +
> +  # Fold the tuple to string.
> +  # e.g.
> +  #   [("a",), ("b",), ("a", "b")] => ["a", "b", "ab"]
> +  ext = map(lambda e : reduce(lambda x, y: x + y, e), ext)
> +
> +  # Drop duplicated entry.
> +  ext = unique(ext)
> +
> +  return ext
> +
>  for cfg in sys.argv[1:]:
>try:
>  (arch, abi, extra, ext) = cfg.split('-')
> @@ -125,9 +216,13 @@ for cfg in sys.argv[1:]:
>arches[arch] = 1
>abis[abi] = 1
>extra = list(filter(None, extra.split(',')))
> -  ext = list(filter(None, ext.split(',')))
> -  alts = sum([[x] + [x + "_" + y for y in ext] for x in [arch] + extra],
> [])
> +  ext_combs = expand_combination(ext)
> +  alts = sum([[x] + [x + y for y in ext_combs] for x in [arch] + extra],
> [])
>alts = list(map(arch_canonicalize, alts))
> +
> +  # Drop duplicated entry.
> +  alts = unique(alts)
> +
>for alt in alts[1:]:
>  arches[alt] = 1
>  reuse.append('march.%s/mabi.%s=march.%s/mabi.%s' % (arch, abi, alt,
> abi))
> --
> 2.28.0
>
>


Re: [PATCH 1/2] [target 87767] Refactor AVX512 broadcast patterns with speical memory constraint.

2020-10-21 Thread Hongtao Liu via Gcc-patches
On Wed, Oct 21, 2020 at 11:04 PM Vladimir Makarov  wrote:
>
>
> On 2020-10-20 10:11 p.m., Hongtao Liu wrote:
> >
> > Changed, and it passed the i386/x86-64 regression test.
> >
> > Update patch.
> >
> Thank you, Hongtao.  This patch is ok for the trunk.
>
>

Thanks for the review, committed.

-- 
BR,
Hongtao


Ping: [PATCH][Arm] Auto-vectorization for MVE: vsub

2020-10-21 Thread Dennis Zhang via Gcc-patches
Ping: https://gcc.gnu.org/pipermail/gcc-patches/2020-October/555646.html
Thanks


From: Dennis Zhang 
Sent: Tuesday, October 6, 2020 5:46 PM
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov; nd; Richard Earnshaw; Ramana Radhakrishnan
Subject: Re: [PATCH][Arm] Auto-vectorization for MVE: vsub

Hi all,

On 8/17/20 6:41 PM, Dennis Zhang wrote:
>
> Hi all,
>
> This patch enables MVE vsub instructions for auto-vectorization.
> It adds RTL templates for MVE vsub instructions using 'minus' instead of
> unspec expression to make the instructions recognizable for vectorization.
> MVE target is added in sub3 optab. The sub3 optab is
> modified to use a mode iterator that selects available modes for various
> targets correspondingly.
> MVE vector modes are enabled in arm_preferred_simd_mode in arm.c to
> support vectorization.
>
> This patch also fixes 'vreinterpretq_*.c' MVE intrinsic tests. The tests
> generate wrong instruction numbers because of unexpected icf optimization.
> This bug is exposed by the MVE vector modes enabled in this patch,
> therefore it is corrected in this patch to avoid test failures.
>
> MVE instructions are documented here:
> https://developer.arm.com/architectures/instruction-sets/simd-isas/helium/helium-intrinsics
>
> The patch is regtested for arm-none-eabi and bootstrapped for
> arm-none-linux-gnueabihf.
>
> Is it OK for trunk please?
>
> Thanks
> Dennis
>
> gcc/ChangeLog:
>
> 2020-08-10  Dennis Zhang  
>
>   * config/arm/arm.c (arm_preferred_simd_mode): Enable MVE vector modes.
>   * config/arm/arm.h (TARGET_NEON_IWMMXT): New macro.
>   (TARGET_NEON_IWMMXT_MVE, TARGET_NEON_IWMMXT_MVE_FP): Likewise.
>   (TARGET_NEON_MVE_HFP): Likewise.
>   * config/arm/iterators.md (VSEL): New mode iterator to select modes
>   for corresponding targets.
>   * config/arm/mve.md (mve_vsubq<mode>): New entry for vsub instruction
>   using expression 'minus'.
>   (mve_vsubq_f<mode>): Use minus instead of VSUBQ_F unspec.
>   * config/arm/neon.md (sub<mode>3): Removed here. Integrated in the
>   sub<mode>3 in vec-common.md
>   * config/arm/vec-common.md (sub<mode>3): Enable MVE target. Use VSEL
>   to select available modes. Exclude TARGET_NEON_FP16INST from
>   TARGET_NEON statement. Intergrate TARGET_NEON_FP16INST which is
>   originally in neon.md.
>
> gcc/testsuite/ChangeLog:
>
> 2020-08-10  Dennis Zhang  
>
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_f16.c: Use additional
>   option -fno-ipa-icf and change the instruction count from 8 to 16.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_f32.c: Likewise.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_s16.c: Likewise.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_s32.c: Likewise.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_s64.c: Likewise.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_s8.c: Likewise.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_u16.c: Likewise.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_u32.c: Likewise.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_u64.c: Likewise.
>   * gcc.target/arm/mve/intrinsics/vreinterpretq_u8.c: Likewise.
>   * gcc.target/arm/mve/mve.exp: Include tests in subdir 'vect'.
>   * gcc.target/arm/mve/vect/vect_sub_0.c: New test.
>   * gcc.target/arm/mve/vect/vect_sub_1.c: New test.
>

This patch is updated based on Richard Sandiford's patch adding new
vector mode macros:
https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553425.html
The old version of this patch is at
https://gcc.gnu.org/pipermail/gcc-patches/2020-August/552104.html
And a less related part in the old version is separated into another
patch: https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554100.html

This patch enables MVE vsub instructions for auto-vectorization.
It adds insns for MVE vsub instructions using 'minus' instead of unspec
expression to make the instructions recognizable for auto-vectorization.
The sub<mode>3 in mve.md is modified to use new mode macros which make
the expander available when certain modes are supported. Then various
targets can share this expander for vectorization. The redundant
sub<mode>3 insns in neon.md are then removed.
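
The kind of loop this lets the vectorizer turn into MVE vsub looks like
this (a sketch in the spirit of the new tests; the flags and the exact
test contents are assumed, not quoted):

/* Compile with, e.g., -O2 -march=armv8.1-m.main+mve.  */
void
vsub_i32 (int *restrict a, int *restrict b, int *restrict c, int n)
{
  for (int i = 0; i < n; i++)
    c[i] = a[i] - b[i];
}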

Regression tested on arm-none-eabi and bootstraped on
arm-none-linux-gnueabihf.

Is it OK for trunk please?

Thanks
Dennis

gcc/ChangeLog:

2020-10-02  Dennis Zhang  

* config/arm/mve.md (mve_vsubq<mode>): New entry for vsub instruction
using expression 'minus'.
(mve_vsubq_f<mode>): Use minus instead of VSUBQ_F unspec.
* config/arm/neon.md (*sub<mode>3_neon): Use the new mode macros
ARM_HAVE_<MODE>_ARITH.
(sub<mode>3, sub<mode>3_fp16): Removed.
(neon_vsub<mode>): Use gen_sub<mode>3 instead of gen_sub<mode>3_fp16.
* config/arm/vec-common.md (sub<mode>3): Use the new mode macros
ARM_HAVE_<MODE>_ARITH.

gcc/testsuite/ChangeLog:

2020-10-02  Dennis Zhang  

* gcc.target/arm/simd/mve-vsub_1.c: New test.



Re: [PATCH][Arm] Auto-vectorization for MVE: vmin/vmax

2020-10-21 Thread Dennis Zhang via Gcc-patches
Hi Kyrylo,

> 
> From: Kyrylo Tkachov 
> Sent: Wednesday, October 14, 2020 10:15 AM
> To: Dennis Zhang; gcc-patches@gcc.gnu.org
> Cc: nd; Richard Earnshaw; Ramana Radhakrishnan
> Subject: RE: [PATCH][Arm] Auto-vectorization for MVE: vmin/vmax
>
> Hi Dennis,
>
> > -Original Message-
> > From: Dennis Zhang 
> > Sent: 06 October 2020 17:59
> > To: gcc-patches@gcc.gnu.org
> > Cc: Kyrylo Tkachov ; nd ;
> > Richard Earnshaw ; Ramana Radhakrishnan
> > 
> > Subject: [PATCH][Arm] Auto-vectorization for MVE: vmin/vmax
> >
> > Hi all,
> >
> > This patch enables MVE vmin/vmax instructions for auto-vectorization.
> > MVE target is included in expander smin3, umin3,
> > smax3
> > and umax3 for vectorization.
> > Related insns for vmin/vmax in mve.md are modified to use smin, umin,
> > smax and umax expressions instead of unspec to support the expanders.
> >
> > Regression tested on arm-none-eabi and bootstraped on
> > arm-none-linux-gnueabihf.
> >
> > Is it OK for trunk please?
>
> Ok.
> Thanks,
> Kyrill
>

Thanks for your approval.
This patch has been committed to trunk at 
76835dca95ab9f3f106a0db1e6152ad0740b38b3

Cheers
Dennis

Move nested function info out of cgraph_node

2020-10-21 Thread Jan Hubicka
Hi,
this patch moves nested function information out of cgraph_node (to a summary).
This saves memory (especially at WPA time) and also makes nested function
support more contained.
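
For readers unfamiliar with the feature, "nested functions" here means
the GNU C extension (an illustrative example, not from the patch):

/* bar is nested in foo and refers to foo's frame; the cgraph used to
   chain such nodes directly off the enclosing function's node, which
   this patch moves into a separate summary.  */
int
foo (int x)
{
  int bar (int y) { return x + y; }
  return bar (1);
}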

Bootstrapped/regtested x86_64-linux.  Plan to commit it tomorrow.

gcc/ChangeLog:

2020-10-22  Jan Hubicka  

* cgraph.c: Include tree-nested.h
(cgraph_node::create): Call maybe_record_nested_function.
(cgraph_node::remove): Do not remove function from nested function
infos.
(cgraph_node::dump): Update.
(cgraph_node::unnest): Move to tree-nested.c
(cgraph_node::verify_node): Update.
(cgraph_c_finalize): Call nested_function_info::release.
* cgraph.h (struct symtab_node): Remove nested function info.
* cgraphclones.c (cgraph_node::create_clone): Do not clone nested
function info.
* cgraphunit.c (cgraph_node::analyze): Update.
(cgraph_node::expand): Do not worry about nested functions; they are
lowered.
(symbol_table::finalize_compilation_unit): Call
nested_function_info::release.
* gimplify.c: Include tree-nested.h
(unshare_body): Update.
(unvisit_body): Update.
* omp-offload.c (omp_discover_implicit_declare_target): Update.
* tree-nested.c: Include alloc-pool.h, tree-nested.h, symbol-summary.h
(nested_function_sum): New static variable.
(nested_function_info::get): New member function.
(nested_function_info::get_create): New member function.
(unnest_function): New function.
(nested_function_info::~nested_function_info): New member function.
(nested_function_info::release): New function.
(maybe_record_nested_function): New function.
(lookup_element_for_decl): Update.
(check_for_nested_with_variably_modified): Update.
(create_nesting_tree): Update.
(unnest_nesting_tree_1): Update.
(gimplify_all_functions): Update.
(lower_nested_functions): Update.
* tree-nested.h (class nested_function_info): New class.
(maybe_record_nested_function): Declare.
(unnest_function): Declare.
(first_nested_function): New inline function.
(next_nested_function): New inline function.
(nested_function_origin): New inline function.

gcc/ada/ChangeLog:

2020-10-22  Jan Hubicka  

* gcc-interface/trans.c: Include tree-nested.h
(walk_nesting_tree): Update for new nested function info.

gcc/c-family/ChangeLog:

2020-10-22  Jan Hubicka  

* c-gimplify.c: Include tree-nested.h
(c_genericize): Update for new nested function info.

gcc/d/ChangeLog:

2020-10-22  Jan Hubicka  

* decl.cc: Include tree-nested.h
(get_symbol_decl): Update for new nested function info.

diff --git a/gcc/ada/gcc-interface/trans.c b/gcc/ada/gcc-interface/trans.c
index f03d591a323..6babbd41d52 100644
--- a/gcc/ada/gcc-interface/trans.c
+++ b/gcc/ada/gcc-interface/trans.c
@@ -50,6 +50,7 @@
 #include "gomp-constants.h"
 #include "stringpool.h"
 #include "attribs.h"
+#include "tree-nested.h"
 
 #include "ada.h"
 #include "adadecode.h"
@@ -3696,7 +3697,8 @@ finalize_nrv_unc_r (tree *tp, int *walk_subtrees, void *data)
 static void
 walk_nesting_tree (struct cgraph_node *node, walk_tree_fn func, void *data)
 {
-  for (node = node->nested; node; node = node->next_nested)
+  for (node = first_nested_function (node);
+   node; node = next_nested_function (node))
 {
   walk_tree_without_duplicates (_SAVED_TREE (node->decl), func, data);
   walk_nesting_tree (node, func, data);
diff --git a/gcc/c-family/c-gimplify.c b/gcc/c-family/c-gimplify.c
index d1e391590dd..a7c0ec3be0d 100644
--- a/gcc/c-family/c-gimplify.c
+++ b/gcc/c-family/c-gimplify.c
@@ -39,6 +39,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "langhooks.h"
 #include "dumpfile.h"
 #include "c-ubsan.h"
+#include "tree-nested.h"
 
 /*  The gimplification pass converts the language-dependent trees
 (ld-trees) emitted by the parser into language-independent trees
@@ -572,7 +573,8 @@ c_genericize (tree fndecl)
 
   /* Dump all nested functions now.  */
   cgn = cgraph_node::get_create (fndecl);
-  for (cgn = cgn->nested; cgn ; cgn = cgn->next_nested)
+  for (cgn = first_nested_function (cgn);
+   cgn; cgn = next_nested_function (cgn))
 c_genericize (cgn->decl);
 }
 
diff --git a/gcc/cgraph.c b/gcc/cgraph.c
index f018020fa4b..9480935ff84 100644
--- a/gcc/cgraph.c
+++ b/gcc/cgraph.c
@@ -64,6 +64,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "selftest.h"
 #include "tree-into-ssa.h"
 #include "ipa-inline.h"
+#include "tree-nested.h"
 
/* FIXME: Only for PROP_loops, but cgraph shouldn't have to know about this.  */
 #include "tree-pass.h"
@@ -517,13 +518,8 @@ cgraph_node::create (tree decl)
 node->ifunc_resolver = true;
 
   node->register_symbol ();
+  maybe_record_nested_function (node);
 
-  if (DECL_CONTEXT 

[PATCH] Handle a_2 = &b properly in range calculations.

2020-10-21 Thread Andrew MacLeod via Gcc-patches

Pick up the correct type for the RHS of a_2 = &b.

bootstrapped on  x86_64-pc-linux-gnu, no regressions, pushed.

Andrew
commit 966fdb2e12c0347aa3f9efaf5f4e1cd8237fa024
Author: Andrew MacLeod 
Date:   Wed Oct 21 20:11:16 2020 -0400

Handle a_2 = &b properly in range calculations.

When processing assignments, we were using the type of b instead of the
type of &b when computing a range.  This was usually filtered out by FRE;
turning it off exposed it.

gcc/
PR tree-optimization/97520
* gimple-range.cc (range_of_non_trivial_assignment): Handle x = &b
by returning a non-zero range.
gcc/testsuite/
* gcc.dg/pr97520.c: New.

diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
index c5520e0700b..267ebad757f 100644
--- a/gcc/gimple-range.cc
+++ b/gcc/gimple-range.cc
@@ -446,17 +446,31 @@ gimple_ranger::range_of_non_trivial_assignment (irange &r, gimple *stmt)
 return false;
 
   tree base = gimple_range_base_of_assignment (stmt);
-  if (base && TREE_CODE (base) == MEM_REF
-  && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME)
+  if (base)
 {
-  int_range_max range1;
-  tree ssa = TREE_OPERAND (base, 0);
-  if (range_of_expr (range1, ssa, stmt))
+  if (TREE_CODE (base) == MEM_REF)
 	{
-	  tree type = TREE_TYPE (ssa);
-	  range_operator *op = range_op_handler (POINTER_PLUS_EXPR, type);
-	  int_range<2> offset (TREE_OPERAND (base, 1), TREE_OPERAND (base, 1));
-	  op->fold_range (r, type, range1, offset);
+	  if (TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME)
+	{
+	  int_range_max range1;
+	  tree ssa = TREE_OPERAND (base, 0);
+	  if (range_of_expr (range1, ssa, stmt))
+		{
+		  tree type = TREE_TYPE (ssa);
+		  range_operator *op = range_op_handler (POINTER_PLUS_EXPR,
+			 type);
+		  int_range<2> offset (TREE_OPERAND (base, 1),
+   TREE_OPERAND (base, 1));
+		  op->fold_range (r, type, range1, offset);
+		  return true;
+		}
+	}
+	  return false;
+	}
+  if (gimple_assign_rhs_code (stmt) == ADDR_EXPR)
+	{
+	  // Handle "= &b" and return non-zero.
+	  r = range_nonzero (TREE_TYPE (gimple_assign_rhs1 (stmt)));
 	  return true;
 	}
 }
diff --git a/gcc/testsuite/gcc.dg/pr97520.c b/gcc/testsuite/gcc.dg/pr97520.c
new file mode 100644
index 000..9f665959138
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr97520.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-fre" } */
+
+char a;
+void b() {
+  char *c[5];
+  char *d = &a;
+  
+  *(c[4] = d);
+}
+int main() { return 0; }


Re: [PATCH][Arm] Auto-vectorization for MVE: vmul

2020-10-21 Thread Dennis Zhang via Gcc-patches
Hi kyrylo,

> 
> From: Kyrylo Tkachov 
> Sent: Wednesday, October 14, 2020 10:14 AM
> To: Dennis Zhang; gcc-patches@gcc.gnu.org
> Cc: nd; Richard Earnshaw; Ramana Radhakrishnan
> Subject: RE: [PATCH][Arm] Auto-vectorization for MVE: vmul
> 
> Hi Dennis,
> 
> > -Original Message-
> > From: Dennis Zhang 
> > Sent: 06 October 2020 17:55
> > To: gcc-patches@gcc.gnu.org
> > Cc: Kyrylo Tkachov ; nd ;
> > Richard Earnshaw ; Ramana Radhakrishnan
> > 
> > Subject: [PATCH][Arm] Auto-vectorization for MVE: vmul
> >
> > Hi all,
> >
> > This patch enables MVE vmul instructions for auto-vectorization.
> > It includes MVE in expander mul<mode>3 to enable vectorization for MVE
> > and modifies related vmul insns to support the expander by using 'mult'
> > instead of unspec.
> > The mul<mode>3 for vectorization in vec-common.md uses mode iterator
> > VDQWH instead of VALLW to cover all supported modes.
> > The macros ARM_HAVE_<MODE>_ARITH are used to select supported modes for
> > different targets. The redundant mul<mode>3 in neon.md is removed.
> >
> > Regression tested on arm-none-eabi and bootstraped on
> > arm-none-linux-gnueabihf.
> >
> > Is it OK for trunk please?
> 
> Ok, thank you for your patience.
> Kyrill
> 

Thanks for your approval.
It's committed to trunk at 0f41b5e02fa47db2080b77e4e1f7cd3305457c05

Cheers
Dennis


[PATCH] Check for undefined before not returning a constant value

2020-10-21 Thread Andrew MacLeod via Gcc-patches
Full comments in the PR, but basically substitute and fold was expecting 
to see a constant returned for any range which globally evaluates to a 
constant.

This allowed it to replace all uses of an ssa name as they were encountered.

The ranger model can return UNDEFINED for the range of names in blocks
which are unreachable.  The patch simply overrides the UNDEFINED value
in the folder's query if the global value is a constant, so that
subst_and_fold will get what it is expecting.


Bootstrapped on  x86_64-pc-linux-gnu, no regressions, pushed.

Andrew
commit 0d0bbb379d7e5f029d0fb05465fac996493e7850
Author: Andrew MacLeod 
Date:   Wed Oct 21 19:55:28 2020 -0400

Check for undefined before not returning a constant value

Don't return UNDEFINED for a range in an unreachable block if the global
value evaluates to a constant.  Return that constant instead.

PR tree-optimization/97515
* value-query.cc (range_query::value_of_expr): If the result is
UNDEFINED, check to see if the global value is a constant.
(range_query::value_on_edge): Ditto.

diff --git a/gcc/testsuite/gcc.dg/pr97515.c b/gcc/testsuite/gcc.dg/pr97515.c
new file mode 100644
index 000..2b6185ec90b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr97515.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int
+e7 (int gg)
+{
+  int xe = 0;
+
+  while (xe < 1)
+{
+  int ui;
+
+  ui = ~xe;
+  if (ui == 0)
+ui = xe >> gg;
+
+  xe %= !ui;
+}
+
+  return xe;
+}
diff --git a/gcc/value-query.cc b/gcc/value-query.cc
index 5370a23fe18..23ba48d73a7 100644
--- a/gcc/value-query.cc
+++ b/gcc/value-query.cc
@@ -82,8 +82,16 @@ range_query::value_of_expr (tree name, gimple *stmt)
 
   if (!irange::supports_type_p (TREE_TYPE (name)))
 return NULL_TREE;
-  if (range_of_expr (r, name, stmt) && r.singleton_p (&t))
-return t;
+
+  if (range_of_expr (r, name, stmt))
+{
+  // A constant used in an unreachable block often returns as UNDEFINED.
+  // If the result is undefined, check the global value for a constant.
+  if (r.undefined_p ())
+	range_of_expr (r, name);
+  if (r.singleton_p (&t))
+	return t;
+}
   return NULL_TREE;
 }
 
@@ -95,8 +103,15 @@ range_query::value_on_edge (edge e, tree name)
 
   if (!irange::supports_type_p (TREE_TYPE (name)))
 return NULL_TREE;
-  if (range_on_edge (r, e, name) && r.singleton_p (&t))
-return t;
+  if (range_on_edge (r, e, name))
+{
+  // A constant used in an unreachable block often returns as UNDEFINED.
+  // If the result is undefined, check the global value for a constant.
+  if (r.undefined_p ())
+	range_of_expr (r, name);
+  if (r.singleton_p (&t))
+	return t;
+}
   return NULL_TREE;
 
 }


libgo patch committed: Update BSD socket code

2020-10-21 Thread Ian Lance Taylor via Gcc-patches
This libgo patch by Nikhil Benesch imports the upstream code for BSD
sockets and sysctls into the syscall package.  This code is
sufficiently different that it doesn't get merged by the scripts.
Bootstrapped and ran Go testsuite on x86_64-pc-linux-gnu.  Committed
to mainline.

Ian
d031aa7b3be4155bf83ed55d43da6900f091cb80
diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE
index fb7aa3e6eae..aedaf4664d5 100644
--- a/gcc/go/gofrontend/MERGE
+++ b/gcc/go/gofrontend/MERGE
@@ -1,4 +1,4 @@
-b2be94556bbc98f565fc277e30a038c742bf28a4
+64c25b2365f7125a32b3146618b627f26a78c1fc
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
diff --git a/libgo/go/syscall/route_bsd.go b/libgo/go/syscall/route_bsd.go
index b364eeaba5d..0c32594783c 100644
--- a/libgo/go/syscall/route_bsd.go
+++ b/libgo/go/syscall/route_bsd.go
@@ -18,7 +18,7 @@ var (
 
 // Round the length of a raw sockaddr up to align it properly.
 func rsaAlignOf(salen int) int {
-   salign := sizeofPtr
+   salign := int(sizeofPtr)
if darwin64Bit {
// Darwin kernels require 32-bit aligned access to
// routing facilities.
diff --git a/libgo/go/syscall/socket_bsd.go b/libgo/go/syscall/socket_bsd.go
index 40637bc7818..b230a3212e6 100644
--- a/libgo/go/syscall/socket_bsd.go
+++ b/libgo/go/syscall/socket_bsd.go
@@ -13,6 +13,7 @@ import "unsafe"
 const SizeofSockaddrInet4 = 16
 const SizeofSockaddrInet6 = 28
 const SizeofSockaddrUnix = 110
+const SizeofSockaddrDatalink = 20
 
 type RawSockaddrInet4 struct {
Lenuint8
@@ -76,6 +77,46 @@ func (sa *RawSockaddrUnix) adjustAbstract(sl Socklen_t) Socklen_t {
return sl
 }
 
+type SockaddrDatalink struct {
+   Lenuint8
+   Family uint8
+   Index  uint16
+   Type   uint8
+   Nlen   uint8
+   Alen   uint8
+   Slen   uint8
+   Data   [12]int8
+   rawRawSockaddrDatalink
+}
+
+func (sa *SockaddrDatalink) sockaddr() (*RawSockaddrAny, Socklen_t, error) {
+   if sa.Index == 0 {
+   return nil, 0, EINVAL
+   }
+   sa.raw.Len = sa.Len
+   sa.raw.Family = AF_LINK
+   sa.raw.Index = sa.Index
+   sa.raw.Type = sa.Type
+   sa.raw.Nlen = sa.Nlen
+   sa.raw.Alen = sa.Alen
+   sa.raw.Slen = sa.Slen
+   for i := 0; i < len(sa.raw.Data); i++ {
+   sa.raw.Data[i] = sa.Data[i]
+   }
+   return (*RawSockaddrAny)(unsafe.Pointer(&sa.raw)), SizeofSockaddrDatalink, nil
+}
+
+type RawSockaddrDatalink struct {
+   Lenuint8
+   Family uint8
+   Index  uint16
+   Type   uint8
+   Nlen   uint8
+   Alen   uint8
+   Slen   uint8
+   Data   [12]int8
+}
+
 type RawSockaddr struct {
Lenuint8
Family uint8
diff --git a/libgo/go/syscall/syscall_netbsd.go b/libgo/go/syscall/syscall_netbsd.go
index c67550a011d..bbc6799e3e6 100644
--- a/libgo/go/syscall/syscall_netbsd.go
+++ b/libgo/go/syscall/syscall_netbsd.go
@@ -17,3 +17,64 @@ func direntReclen(buf []byte) (uint64, bool) {
 func direntNamlen(buf []byte) (uint64, bool) {
	return readInt(buf, unsafe.Offsetof(Dirent{}.Namlen), unsafe.Sizeof(Dirent{}.Namlen))
 }
+
+func sysctlNodes(mib []_C_int) (nodes []Sysctlnode, err error) {
+   var olen uintptr
+
+   // Get a list of all sysctl nodes below the given MIB by performing
+   // a sysctl for the given MIB with CTL_QUERY appended.
+   mib = append(mib, CTL_QUERY)
+   qnode := Sysctlnode{Flags: SYSCTL_VERS_1}
+   qp := (*byte)(unsafe.Pointer(&qnode))
+   sz := unsafe.Sizeof(qnode)
+   if err = sysctl(mib, nil, &olen, qp, sz); err != nil {
+   return nil, err
+   }
+
+   // Now that we know the size, get the actual nodes.
+   nodes = make([]Sysctlnode, olen/sz)
+   np := (*byte)(unsafe.Pointer(&nodes[0]))
+   if err = sysctl(mib, np, &olen, qp, sz); err != nil {
+   return nil, err
+   }
+
+   return nodes, nil
+}
+
+func nametomib(name string) (mib []_C_int, err error) {
+   // Split name into components.
+   var parts []string
+   last := 0
+   for i := 0; i < len(name); i++ {
+   if name[i] == '.' {
+   parts = append(parts, name[last:i])
+   last = i + 1
+   }
+   }
+   parts = append(parts, name[last:])
+
+   // Discover the nodes and construct the MIB OID.
+   for partno, part := range parts {
+   nodes, err := sysctlNodes(mib)
+   if err != nil {
+   return nil, err
+   }
+   for _, node := range nodes {
+   n := make([]byte, 0)
+   for i := range node.Name {
+   if node.Name[i] != 0 {
+   n = append(n, byte(node.Name[i]))
+   }
+   }
+   if string(n) == part {
+ 

Increase weight of builtin_constant_p hint

2020-10-21 Thread Jan Hubicka
Hi,
this patch makes the builtin_constant_p hint combine with other loop hints
we already support.  This is necessary for it to be really effective,
since most such functions will already get the big_speedup hint.
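
The pattern the hint rewards looks like this (a hypothetical example;
slow_path is an assumed out-of-line helper):

extern unsigned long slow_path (unsigned long);

static inline unsigned long
log2_ceil (unsigned long n)
{
  /* When inlined into a caller passing a constant, the builtin folds
     to 1 and the slow path becomes dead code, so inlining is far more
     profitable than the raw body size suggests.  (Assumes 64-bit
     unsigned long.)  */
  if (__builtin_constant_p (n))
    return n < 2 ? 0 : 64 - __builtin_clzl (n - 1);
  return slow_path (n);
}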

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

2020-10-22  Jan Hubicka  

PR ipa/97445
* ipa-inline.c (inline_insns_single): Add hint2 parameter.
(inline_insns_auto): Add hint2 parameter.
(can_inline_edge_by_limits_p): Update.
(want_inline_small_function_p): Update.
(wrapper_heuristics_may_apply): Update.

diff --git a/gcc/ipa-inline.c b/gcc/ipa-inline.c
index bc846eabb58..9555c607a3d 100644
--- a/gcc/ipa-inline.c
+++ b/gcc/ipa-inline.c
@@ -398,26 +398,42 @@ can_inline_edge_p (struct cgraph_edge *e, bool report,
   return inlinable;
 }
 
-/* Return inlining_insns_single limit for function N. If HINT is true
+/* Return inlining_insns_single limit for function N.  If HINT or HINT2 is true
scale up the bound.  */
 
 static int
-inline_insns_single (cgraph_node *n, bool hint)
+inline_insns_single (cgraph_node *n, bool hint, bool hint2)
 {
-  if (hint)
+  if (hint && hint2)
+{
+  int64_t spd = opt_for_fn (n->decl, param_inline_heuristics_hint_percent);
+  spd = spd * spd;
+  if (spd > 1000000)
+   spd = 1000000;
+  return opt_for_fn (n->decl, param_max_inline_insns_single) * spd / 10000;
+}
+  if (hint || hint2)
 return opt_for_fn (n->decl, param_max_inline_insns_single)
   * opt_for_fn (n->decl, param_inline_heuristics_hint_percent) / 100;
   return opt_for_fn (n->decl, param_max_inline_insns_single);
 }
 
-/* Return inlining_insns_auto limit for function N. If HINT is true
+/* Return inlining_insns_auto limit for function N.  If HINT or HINT2 is true
scale up the bound.   */
 
 static int
-inline_insns_auto (cgraph_node *n, bool hint)
+inline_insns_auto (cgraph_node *n, bool hint, bool hint2)
 {
  int max_inline_insns_auto = opt_for_fn (n->decl, param_max_inline_insns_auto);
-  if (hint)
+  if (hint && hint2)
+{
+  int64_t spd = opt_for_fn (n->decl, param_inline_heuristics_hint_percent);
+  spd = spd * spd;
+  if (spd > 1000000)
+   spd = 1000000;
+  return max_inline_insns_auto * spd / 10000;
+}
+  if (hint || hint2)
 return max_inline_insns_auto
   * opt_for_fn (n->decl, param_inline_heuristics_hint_percent) / 100;
   return max_inline_insns_auto;
@@ -566,8 +582,8 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report,
  int growth = estimate_edge_growth (e);
  if (growth > opt_for_fn (caller->decl, param_max_inline_insns_size)
  && (!DECL_DECLARED_INLINE_P (callee->decl)
- && growth >= MAX (inline_insns_single (caller, false),
-   inline_insns_auto (caller, false
+ && growth >= MAX (inline_insns_single (caller, false, false),
+   inline_insns_auto (caller, false, false
{
  e->inline_failed = CIF_OPTIMIZATION_MISMATCH;
  inlinable = false;
@@ -806,7 +822,7 @@ inlining_speedup (struct cgraph_edge *edge,
 }
 
 /* Return true if the speedup for inlining E is bigger than
-   PARAM_MAX_INLINE_MIN_SPEEDUP.  */
+   param_inline_min_speedup.  */
 
 static bool
 big_speedup_p (struct cgraph_edge *e)
@@ -855,7 +871,7 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report)
   && (!e->count.ipa ().initialized_p () || !e->maybe_hot_p ()))
   && ipa_fn_summaries->get (callee)->min_size
- ipa_call_summaries->get (e)->call_stmt_size
- > inline_insns_auto (e->caller, true))
+ > inline_insns_auto (e->caller, true, true))
 {
   e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT;
   want_inline = false;
@@ -864,7 +880,7 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report)
|| e->count.ipa ().nonzero_p ())
   && ipa_fn_summaries->get (callee)->min_size
- ipa_call_summaries->get (e)->call_stmt_size
- > inline_insns_single (e->caller, true))
+ > inline_insns_single (e->caller, true, true))
 {
   e->inline_failed = (DECL_DECLARED_INLINE_P (callee->decl)
  ? CIF_MAX_INLINE_INSNS_SINGLE_LIMIT
@@ -875,11 +891,14 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report)
 {
   int growth = estimate_edge_growth (e);
   ipa_hints hints = estimate_edge_hints (e);
+  /* We have two independent groups of hints.  If one matches in each
+of the groups, the limits are increased.  If both groups match, the
+limit is increased even more.  */
   bool apply_hints = (hints & (INLINE_HINT_indirect_call
   | INLINE_HINT_known_hot
   | INLINE_HINT_loop_iterations
-  | INLINE_HINT_loop_stride
- 

Re: [PATCH 2/8] [RS6000] rs6000_rtx_costs for AND

2020-10-21 Thread Alan Modra via Gcc-patches
On Wed, Oct 21, 2020 at 03:29:11PM -0500, Segher Boessenkool wrote:
> Anyway:
> 
> +  || (outer_code == AND
> +  && rs6000_is_valid_2insn_and (x, mode)))
> {
>   *total = COSTS_N_INSNS (1);
>   return true;
> 
> It should return COSTS_N_INSNS (2) for that?

No, it should not!

  /* (reg) is costed at zero by rtlanal.c:rtx_cost.  That sets a
 baseline for rtx costs:  If a constant is valid in an insn,
 it is free.  */

-- 
Alan Modra
Australia Development Lab, IBM


Re: [PATCH] Implement no_stack_protect attribute.

2020-10-21 Thread Nick Desaulniers via Gcc-patches
+ correct kernel mailing list this time.

On Wed, Oct 21, 2020 at 2:33 PM Nick Desaulniers
 wrote:
>
> Thanks for the quick feedback!
>
> On Wed, Oct 21, 2020 at 2:13 PM Jakub Jelinek  wrote:
> >
> > On Wed, Oct 21, 2020 at 02:04:15PM -0700, Nick Desaulniers via Gcc-patches 
> > wrote:
> > > Tangentially related question:
> > > We're running into a bug related to LTO for the kernel when code
> > > compiled with -fno-stack-protector is called from and inlined into
> > > code that is compiled with -fstack-protector.  Specifically, stack
> > > canaries get checked before they're restored post suspend/resume which
> > > leads to spooky bugs.
> > >
> > > Once we have more fine grain function level attribute to explicitly
> > > disable stack protectors on a per function basis, I'm considering
> > > making this function attribute a barrier to inlining in LLVM so that
> > > callers with stack protectors don't inline callees that explicitly
> > > should not have a stack protector and vice versa (more concretely,
> > > when they don't match).  I think this would maximize which functions
> > > are still covered by stack protectors, and be the most straightforward
> > > to implement.
> >
> > That doesn't make sense to me.
> > Stack protector doesn't affect in any way inlined code, the stack protection
> > is always solely in the prologue and epilogue of out of line functions.
> > So, if the (non-inlined) caller is -fstack-protector and inlined callee
> > is -fno-stack-protector, there should be ssp store in the prologue of the
> > resulting function and test in the epilogue.
>
> That is the case today, and I'm arguing that leads to bugs in the
> Linux kernel when built with LTO.
>
> > The effect will be exactly
> > like that if the function wouldn't be inlined.
>
> I don't follow.  If the -fno-stack-protector callee was not inlined,
> the caller would have a stack protector, while the callee would not.
> I think today there's not a strong enough distinction between the
> level of stack protection being specified vs explicit
> annotations/flags that stack protectors MUST NOT be inserted.
>
> > Similarly, if the non-inlined caller is -fno-stack-protector and inlined
> > callee is -fstack-protector, there will be no stack protection.  This isn't
>
> And I'd argue that now we may have stripped off stack protection in
> the pursuit of inlining.  Consider for example the case where that
> stack protected callee contained a large alloca; post inlining into a
> -fno-stack-protected caller suddenly now it doesn't.  Oops?
>
> Wouldn't it be safer to just prevent inlining, then the callee retains
> the stack protector, regardless of caller stack protection?
>
> > exactly the effect one would get without the inlining (as in that case
> > the callee would check it), but matches the general behavior that we allow
> > inlining functions with -fstack-protector* at all (and only check it in the
> > prologue/epilogue, not somewhere in the middle).
> >
> > Jakub
> >
>
>
> --
> Thanks,
> ~Nick Desaulniers



-- 
Thanks,
~Nick Desaulniers


Re: [PATCH] c++: Handle RANGE_EXPR indexes in init_subob_ctx [PR97328]

2020-10-21 Thread Jason Merrill via Gcc-patches

On 10/21/20 5:46 PM, Patrick Palka wrote:

On Wed, 21 Oct 2020, Jason Merrill wrote:


On 10/8/20 4:47 PM, Patrick Palka wrote:

In the testcase below, we're ICEing during constexpr evaluation of the
CONSTRUCTOR {.data={{}, [1 ... 7]={}}} of type 'vector'.  The apparently
unique thing about this CONSTRUCTOR is that it has a RANGE_EXPR index
whose corresponding sub-aggregate initializer doesn't satisfy
reduced_constant_expression_p (because its field 't' is uninitialized).

This is a problem because init_subob_ctx currently punts if the
constructor index is a RANGE_EXPR, so when cxx_eval_bare_aggregate
recurses into this sub-aggregate initializer we trip over the
same_type_p assert in verify_ctor_sanity.

Fix this by making init_subob_ctx set up an appropriate sub-aggregate
initialization context even when the index is a RANGE_EXPR.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk and the 10 branch?

gcc/cp/ChangeLog:

PR c++/97328
* constexpr.c (init_subob_ctx): Don't punt if the index is a
RANGE_EXPR, instead build a sub-aggregate initialization context
with no subobject.

gcc/testsuite/ChangeLog:

PR c++/97328
* g++.dg/cpp2a/constexpr-init19.C: New test.
* g++.dg/cpp2a/constexpr-init20.C: New test.
---
   gcc/cp/constexpr.c| 13 +++--
   gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C | 15 +++
   gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C | 15 +++
   3 files changed, 37 insertions(+), 6 deletions(-)
   create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
   create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index a118f8a810b..e50a2a220cb 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -3953,11 +3953,6 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx &new_ctx,
   {
 new_ctx = *ctx;
   -  if (index && TREE_CODE (index) != INTEGER_CST
-  && TREE_CODE (index) != FIELD_DECL)
-/* This won't have an element in the new CONSTRUCTOR.  */
-return;


Hmm, I wonder what this was trying to exclude?  I'd be more comfortable adding
RANGE_EXPR to the allowed index codes.


Ah, it's probably COMPONENT_REF, NOP_EXPR and/or POINTER_PLUS_EXPR.
I missed that cxx_eval_bare_aggregate explicitly checks for such
indexes.

Here's a patch which refines the above check rather than removing it
entirely.  Does it look OK for 10/trunk after testing?


OK.


-- >8 --

Subject: [PATCH] c++: Handle RANGE_EXPR index in init_subob_ctx [PR97328]

In the testcase below, we're ICEing during constexpr evaluation of the
CONSTRUCTOR {.data={{}, [1 ... 7]={}}} of type 'vector'.  The interesting
thing about this CONSTRUCTOR is that it has a RANGE_EXPR index for an
element initializer which doesn't satisfy reduced_constant_expression_p
(because the field 't' is uninitialized).

This is a problem because init_subob_ctx currently punts on setting up a
sub-aggregate initialization context when given a RANGE_EXPR index, so
we later trip over the asserts in verify_ctor_sanity when recursing into
cxx_eval_bare_aggregate on this element initializer.

Fix this by making init_subob_ctx set up an appropriate initialization
context when given a RANGE_EXPR index.

gcc/cp/ChangeLog:

PR c++/97328
* constexpr.c (init_subob_ctx): Don't punt on RANGE_EXPR
indexes, instead build a sub-aggregate initialization context
with no subobject.

gcc/testsuite/ChangeLog:

PR c++/97328
* g++.dg/cpp2a/constexpr-init19.C: New test.
* g++.dg/cpp2a/constexpr-init20.C: New test.
---
  gcc/cp/constexpr.c| 11 +--
  gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C | 15 +++
  gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C | 15 +++
  3 files changed, 39 insertions(+), 2 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index a118f8a810b..cb3c787094c 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -3954,7 +3954,8 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx &new_ctx,
new_ctx = *ctx;
  
if (index && TREE_CODE (index) != INTEGER_CST

-  && TREE_CODE (index) != FIELD_DECL)
+  && TREE_CODE (index) != FIELD_DECL
+  && TREE_CODE (index) != RANGE_EXPR)
  /* This won't have an element in the new CONSTRUCTOR.  */
  return;
  
@@ -3967,7 +3968,13 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx &new_ctx,

   update object to refer to the subobject and ctor to refer to
   the (newly created) sub-initializer.  */
if (ctx->object)
-new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
+{
+  if (index == NULL_TREE || TREE_CODE (index) == RANGE_EXPR)
+   /* There's no well-defined subobject 

Re: PATCH [DR2303][PR97453]

2020-10-21 Thread Jason Merrill via Gcc-patches

On 10/21/20 6:32 AM, kamlesh kumar wrote:

gcc/cp/ChangeLog
---

2020-10-21  Kamlesh Kumar  

PR c++/97453
* pt.c (get_template_base): Implement DR2303: consider the
closest base during template deduction when a base of a base
also matches.

gcc/testsuite/ChangeLog
--

2020-10-21  Kamlesh Kumar  

* g++.dg/DRs/dr2303.C: New test.

--

As part of this patch I implemented a fix for the CWG defect report below:
https://wg21.cmeerw.net/cwg/issue2303 .


Thanks!

Please see https://gcc.gnu.org/contribute.html for guidance on email 
subject lines; for this patch I'd think something like


[PATCH] c++: Implement DR2303 [PR97453]

Also, your patch was corrupted by word wrap; the easiest way to avoid 
that is probably to attach the file rather than copy it into the message.



Regression tested on x86_64; no failures found.
Patch summary: remove bases of bases from the list of candidate bases.

The patch creates a hash_set from the list of bases, then iterates over
each element of the hash_set, finds that element's own list of bases,
and removes those from the hash_set when present.
Finally, deduction succeeds if the hash_set holds only a single element
or is empty; otherwise deduction is ambiguous.


Instead of building a hash table, would it work to handle ambiguity by 
checking whether one of the classes is a base of the other?
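Roughly, and purely as an untested sketch of that idea (using the
existing DERIVED_FROM_P predicate), the ambiguity handling could stay
incremental instead of building a hash table:

	   if (rval && !same_type_p (r, rval))
	     {
	       if (DERIVED_FROM_P (rval, r))
		 /* R is more derived; DR2303 says to prefer it.  */
		 rval = r;
	       else if (!DERIVED_FROM_P (r, rval))
		 {
		   *result = NULL_TREE;
		   return tbr_ambiguous_baseclass;
		 }
	       /* Otherwise RVAL is already the more derived candidate.  */
	     }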



---
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index dc664ec3798..7adf461e108 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -22643,8 +22643,9 @@ static enum template_base_result
  get_template_base (tree tparms, tree targs, tree parm, tree arg,
  bool explain_p, tree *result)
  {
-  tree rval = NULL_TREE;
+  *result = NULL_TREE;
tree binfo;
+  hash_set<tree> binfo_set;

gcc_assert (RECORD_OR_UNION_CODE_P (TREE_CODE (arg)));

@@ -22659,31 +22660,51 @@ get_template_base (tree tparms, tree targs,
tree parm, tree arg,
/* Walk in inheritance graph order.  The search order is not
   important, and this avoids multiple walks of virtual bases.  */
for (binfo = TREE_CHAIN (binfo); binfo; binfo = TREE_CHAIN (binfo))
-{
-  tree r = try_class_unification (tparms, targs, parm,
-   BINFO_TYPE (binfo), explain_p);
-
-  if (r)
- {
-   /* If there is more than one satisfactory baseclass, then:
-
-[temp.deduct.call]
+ {
+   tree r = try_class_unification (tparms, targs, parm,
+   BINFO_TYPE (binfo), explain_p);
+   if (r)
+ {
+   binfo_set.add(r);
+ }
+ }

-   If they yield more than one possible deduced A, the type
-   deduction fails.
+  /* If there is more than one satisfactory baseclass, then:
+ [temp.deduct.call]
+  If they yield more than one possible deduced A, the type
+  deduction fails.
+ However, if there is a class C that is a (direct or indirect)
base class of
+ D and derived (directly or indirectly) from a class B and that would be a
+ valid deduced A, the deduced A cannot be B or pointer to B,
respectively.  */
+  for (hash_set<tree>::iterator it = binfo_set.begin();
+it != binfo_set.end(); ++it)
+{
+  binfo = TYPE_BINFO (*it);
+  for (binfo = TREE_CHAIN (binfo); binfo; binfo = TREE_CHAIN (binfo))
+{
+  tree r = try_class_unification (tparms, targs, parm,
+  BINFO_TYPE (binfo), explain_p);
+  if (r && binfo_set.contains(r))
+{
+  binfo_set.remove(r);
+}
+}
+}

-  applies.  */
-   if (rval && !same_type_p (r, rval))
- {
-   *result = NULL_TREE;
-   return tbr_ambiguous_baseclass;
- }
+  if (binfo_set.elements() > 1)
+{
+  return tbr_ambiguous_baseclass;
+}

-   rval = r;
- }
+  if (binfo_set.is_empty())
+{
+  return tbr_success;
  }

-  *result = rval;
+  if (binfo_set.elements() == 1)
+{
+  *result = *binfo_set.begin();
+}
return tbr_success;
  }

diff --git a/gcc/testsuite/g++.dg/DRs/dr2303.C
b/gcc/testsuite/g++.dg/DRs/dr2303.C
new file mode 100644
index 000..b4c23332358
--- /dev/null
+++ b/gcc/testsuite/g++.dg/DRs/dr2303.C
@@ -0,0 +1,20 @@
+// DR 2303
+// PR c++/97453
+// { dg-do compile { target c++11 } }
+
+template <typename... T>
+struct A;
+template <>
+struct A<> {};
+template <typename T, typename... Ts>
+struct A<T, Ts...> : A<Ts...> {};
+struct B : A<int, char> {};
+
+template <typename... T>
+void f(const A<T...> &) {
+  static_assert(sizeof...(T) == 2, "it should deduce to A<int, char>");
+}
+
+void g() {
+  f(B{});
+}


./kamlesh





Re: [PATCH] c++: Handle RANGE_EXPR indexes in init_subob_ctx [PR97328]

2020-10-21 Thread Patrick Palka via Gcc-patches
On Wed, 21 Oct 2020, Jason Merrill wrote:

> On 10/8/20 4:47 PM, Patrick Palka wrote:
> > In the testcase below, we're ICEing during constexpr evaluation of the
> > CONSTRUCTOR {.data={{}, [1 ... 7]={}}} of type 'vector'.  The apparently
> > unique thing about this CONSTRUCTOR is that it has a RANGE_EXPR index
> > whose corresponding sub-aggregate initializer doesn't satisfy
> > reduced_constant_expression_p (because its field 't' is uninitialized).
> > 
> > This is a problem because init_subob_ctx currently punts if the
> > constructor index is a RANGE_EXPR, so when cxx_eval_bare_aggregate
> > recurses into this sub-aggregate initializer we trip over the
> > same_type_p assert in verify_ctor_sanity.
> > 
> > Fix this by making init_subob_ctx set up an appropriate sub-aggregate
> > initialization context even when the index is a RANGE_EXPR.
> > 
> > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> > trunk and the 10 branch?
> > 
> > gcc/cp/ChangeLog:
> > 
> > PR c++/97328
> > * constexpr.c (init_subob_ctx): Don't punt if the index is a
> > RANGE_EXPR, instead build a sub-aggregate initialization context
> > with no subobject.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > PR c++/97328
> > * g++.dg/cpp2a/constexpr-init19.C: New test.
> > * g++.dg/cpp2a/constexpr-init20.C: New test.
> > ---
> >   gcc/cp/constexpr.c| 13 +++--
> >   gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C | 15 +++
> >   gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C | 15 +++
> >   3 files changed, 37 insertions(+), 6 deletions(-)
> >   create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
> >   create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
> > 
> > diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
> > index a118f8a810b..e50a2a220cb 100644
> > --- a/gcc/cp/constexpr.c
> > +++ b/gcc/cp/constexpr.c
> > @@ -3953,11 +3953,6 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx &new_ctx,
> >   {
> > new_ctx = *ctx;
> >   -  if (index && TREE_CODE (index) != INTEGER_CST
> > -  && TREE_CODE (index) != FIELD_DECL)
> > -/* This won't have an element in the new CONSTRUCTOR.  */
> > -return;
> 
> Hmm, I wonder what this was trying to exclude?  I'd be more comfortable adding
> RANGE_EXPR to the allowed index codes.

Ah, it's probably COMPONENT_REF, NOP_EXPR and/or POINTER_PLUS_EXPR.
I missed that cxx_eval_bare_aggregate explicitly checks for such
indexes.

Here's a patch which refines the above check rather than removing it
entirely.  Does it look OK for 10/trunk after testing?

-- >8 --

Subject: [PATCH] c++: Handle RANGE_EXPR index in init_subob_ctx [PR97328]

In the testcase below, we're ICEing during constexpr evaluation of the
CONSTRUCTOR {.data={{}, [1 ... 7]={}}} of type 'vector'.  The interesting
thing about this CONSTRUCTOR is that it has a RANGE_EXPR index for an
element initializer which doesn't satisfy reduced_constant_expression_p
(because the field 't' is uninitialized).

This is a problem because init_subob_ctx currently punts on setting up a
sub-aggregate initialization context when given a RANGE_EXPR index, so
we later trip over the asserts in verify_ctor_sanity when recursing into
cxx_eval_bare_aggregate on this element initializer.

Fix this by making init_subob_ctx set up an appropriate initialization
context when given a RANGE_EXPR index.

gcc/cp/ChangeLog:

PR c++/97328
* constexpr.c (init_subob_ctx): Don't punt on RANGE_EXPR
indexes, instead build a sub-aggregate initialization context
with no subobject.

gcc/testsuite/ChangeLog:

PR c++/97328
* g++.dg/cpp2a/constexpr-init19.C: New test.
* g++.dg/cpp2a/constexpr-init20.C: New test.
---
 gcc/cp/constexpr.c| 11 +--
 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C | 15 +++
 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C | 15 +++
 3 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index a118f8a810b..cb3c787094c 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -3954,7 +3954,8 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx &new_ctx,
   new_ctx = *ctx;
 
   if (index && TREE_CODE (index) != INTEGER_CST
-  && TREE_CODE (index) != FIELD_DECL)
+  && TREE_CODE (index) != FIELD_DECL
+  && TREE_CODE (index) != RANGE_EXPR)
 /* This won't have an element in the new CONSTRUCTOR.  */
 return;
 
@@ -3967,7 +3968,13 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx &new_ctx,
  update object to refer to the subobject and ctor to refer to
  the (newly created) sub-initializer.  */
   if (ctx->object)
-new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
+{
+  if 

Re: [PATCH] Implement no_stack_protect attribute.

2020-10-21 Thread Nick Desaulniers via Gcc-patches
Thanks for the quick feedback!

On Wed, Oct 21, 2020 at 2:13 PM Jakub Jelinek  wrote:
>
> On Wed, Oct 21, 2020 at 02:04:15PM -0700, Nick Desaulniers via Gcc-patches 
> wrote:
> > Tangentially related question:
> > We're running into a bug related to LTO for the kernel when code
> > compiled with -fno-stack-protector is called from and inlined into
> > code that is compiled with -fstack-protector.  Specifically, stack
> > canaries get checked before they're restored post suspend/resume which
> > leads to spooky bugs.
> >
> > Once we have a more fine-grained function-level attribute to explicitly
> > disable stack protectors on a per-function basis, I'm considering
> > making this function attribute a barrier to inlining in LLVM so that
> > callers with stack protectors don't inline callees that explicitly
> > should not have a stack protector and vice versa (more concretely,
> > when they don't match).  I think this would maximize which functions
> > are still covered by stack protectors, and be the most straightforward
> > to implement.
>
> That doesn't make sense to me.
> Stack protector doesn't affect in any way inlined code, the stack protection
> is always solely in the prologue and epilogue of out of line functions.
> So, if the (non-inlined) caller is -fstack-protector and inlined callee
> is -fno-stack-protector, there should be ssp store in the prologue of the
> resulting function and test in the epilogue.

That is the case today, and I'm arguing that leads to bugs in the
Linux kernel when built with LTO.

> The effect will be exactly
> like that if the function wouldn't be inlined.

I don't follow.  If the -fno-stack-protector callee was not inlined,
the caller would have a stack protector, while the callee would not.
I think today there's not a strong enough distinction between the
level of stack protection being specified vs explicit
annotations/flags that stack protectors MUST NOT be inserted.

> Similarly, if the non-inlined caller is -fno-stack-protector and inlined
> callee is -fstack-protector, there will be no stack protection.  This isn't

And I'd argue that now we may have stripped off stack protection in
the pursuit of inlining.  Consider for example the case where that
stack protected callee contained a large alloca; post inlining into a
-fno-stack-protected caller suddenly now it doesn't.  Oops?
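A minimal sketch of that scenario (hypothetical names; think of the two
functions living in separate translation units, one built with
-fstack-protector-strong and one with -fno-stack-protector, merged by
LTO):

/* TU 1, built with -fstack-protector-strong.  */
static inline void fill_buf (char *dst, int n)
{
  char tmp[256];   /* large local that would normally get a canary */
  __builtin_memset (tmp, 0, sizeof tmp);
  __builtin_memcpy (dst, tmp, n > 256 ? 256 : n);
}

/* TU 2, built with -fno-stack-protector.  */
void resume_path (char *dst, int n)
{
  fill_buf (dst, n);   /* once inlined, nothing protects tmp */
}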

Wouldn't it be safer to just prevent inlining, then the callee retains
the stack protector, regardless of caller stack protection?

> exactly the effect one would get without the inlining (as in that case
> the callee would check it), but matches the general behavior that we allow
> inlining functions with -fstack-protector* at all (and only check it in the
> prologue/epilogue, not somewhere in the middle).
>
> Jakub
>


-- 
Thanks,
~Nick Desaulniers


Re: Increase inlining limits for inline functions with builtin_constant_p on parameter

2020-10-21 Thread Jan Hubicka
> On Wed, Oct 21, 2020 at 09:03:03PM +0200, Martin Liška wrote:
> > Maybe a silly question, but isn't the code only an obfuscation?
> > What about doing:
> > 
> > if (__builtin_constant_p(x))
> >   return x < 2 ? 0 : __builtin_clz(x);
> > 
> > which is fully eliminated in a c.031t.ccp1 pass right after einline happens
> > for a call context where 'x' is a constant.
> > 
> > Isn't the right place for fix in Linux kernel?
> 
> Of course it is, see
> https://lore.kernel.org/linux-toolchains/21556974-eea1-ed6a-ea6f-3e97a6eea...@csgroup.eu/T/#m12d2586fe18ed27789c8d67a677783a83b79efa8
Thanks for taking care of this :)

Honza
> 
>   Jakub
> 


Re: Increase inlining limits for inline functions with builtin_constant_p on parameter

2020-10-21 Thread Jan Hubicka
> 
> Hello.
> 
> Maybe a silly question, but isn't the code only an obfuscation?
> What about doing:
> 
> if (__builtin_constant_p(x))
>   return x < 2 ? 0 : __builtin_clz(x);
> 
> which is fully eliminated in a c.031t.ccp1 pass right after einline happens
> for a call context where 'x' is a constant.
> 
> Isn't the right place for fix in Linux kernel?

Yes, the Linux kernel should be fixed, and that is discussed in the
corresponding PR (a correct replacement for that ilog2 function has
already been suggested there).

However, there are many additional examples of similar inline functions
that (ab)use builtin_constant_p (and not all of them translate to an
existing gcc builtin).  So adding a heuristic seems useful (and it is a
quite reliable heuristic, I would say).  Also, in the context of
decreasing the -O2 inliner defaults in gcc10, I think a number of such
inline functions became unlucky and are no longer inlined as expected
(a problem I did not realize until looking into the PR).
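To illustrate, a sketch of the kind of helper this heuristic targets
(hypothetical, not taken from the kernel): the body is large in GIMPLE,
but collapses to a constant whenever __builtin_constant_p (x) folds to
true, so inlining it into constant-argument call sites is nearly free:

static inline unsigned
round_up_pow2 (unsigned x)
{
  if (__builtin_constant_p (x))
    return x <= 1 ? 1
	   : x <= 2 ? 2
	   : x <= 4 ? 4
	   : x <= 8 ? 8
	   : x <= 16 ? 16
	   : 0; /* ...the chain continues up to 1u << 31 in real code.  */
  /* Non-constant fallback; assumes x <= 1u << 31.  */
  return x <= 1 ? 1 : 1u << (32 - __builtin_clz (x - 1));
}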

Honza
> 
> Martin


Re: [PATCH] Implement no_stack_protect attribute.

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Oct 21, 2020 at 02:04:15PM -0700, Nick Desaulniers via Gcc-patches 
wrote:
> Tangentially related question:
> We're running into a bug related to LTO for the kernel when code
> compiled with -fno-stack-protector is called from and inlined into
> code that is compiled with -fstack-protector.  Specifically, stack
> canaries get checked before they're restored post suspend/resume which
> leads to spooky bugs.
> 
> Once we have a more fine-grained function-level attribute to explicitly
> disable stack protectors on a per-function basis, I'm considering
> making this function attribute a barrier to inlining in LLVM so that
> callers with stack protectors don't inline callees that explicitly
> should not have a stack protector and vice versa (more concretely,
> when they don't match).  I think this would maximize which functions
> are still covered by stack protectors, and be the most straightforward
> to implement.

That doesn't make sense to me.
Stack protector doesn't affect in any way inlined code, the stack protection
is always solely in the prologue and epilogue of out of line functions.
So, if the (non-inlined) caller is -fstack-protector and inlined callee
is -fno-stack-protector, there should be ssp store in the prologue of the
resulting function and test in the epilogue.  The effect will be exactly
like that if the function wouldn't be inlined.
Similarly, if the non-inlined caller is -fno-stack-protector and inlined
callee is -fstack-protector, there will be no stack protection.  This isn't
exactly the effect one would get without the inlining (as in that case
the callee would check it), but matches the general behavior that we allow
inlining functions with -fstack-protector* at all (and only check it in the
prologue/epilogue, not somewhere in the middle).

Jakub



Re: [PATCH] c++: constexpr evaluation and bare EMPTY_CLASS_EXPR [PR96575]

2020-10-21 Thread Jason Merrill via Gcc-patches

On 10/21/20 12:37 PM, Patrick Palka wrote:

In the testcase below, folding of the initializer for 'ret' inside the
instantiated f::lambda ends up yielding an initializer for which
potential_constant_expression returns false.  This causes finish_function
to mark the lambda as non-constexpr, which ultimately causes us to reject
'f(g)' as a call to a non-constexpr function.

The initializer for 'ret' inside f::lambda, prior to folding, is
the CALL_EXPR

   ::operator() (, ({}, <<< Unknown tree: empty_class_expr >>>;))

where the second argument is a COMPOUND_EXPR whose second operand is an
EMPTY_CLASS_EXPR, formed by build_class_a.  cp_fully_fold_init is able
to only partially fold this initializer, doing away with the COMPOUND_EXPR
to yield

   ::operator() (, <<< Unknown tree: empty_class_expr >>>)

as the final initializer for 'ret'.  This initializer no longer satisfies
potential_constant_expression because this predicate returns false when
it sees a bare EMPTY_CLASS_EXPR that's not wrapped in a COMPOUND_EXPR.

(cp_fully_fold_init first tries maybe_constant_value on the original
CALL_EXPR, but constexpr evaluation punts upon seeing
__builtin_is_constant_evaluated, since manifestly_const_eval is false.)

To fix this, it seems to me we could either make cp_fold preserve
the COMPOUND_EXPR trees produced by build_call_a, or we could
modify potential_constant_expression and friends to handle "bare"
EMPTY_CLASS_EXPR trees.  Assuming it's safe to continue folding
away these COMPOUND_EXPRs, the second approach seems cleaner, so this
patch implements the second approach.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk?

gcc/cp/ChangeLog:

PR c++/96575
* constexpr.c (cxx_eval_constant_expression)
<COMPOUND_EXPR>: Remove now-redundant handling of
COMPOUND_EXPR with EMPTY_CLASS_EXPR second operand.


I don't think this is entirely redundant; it still avoids building a new 
CONSTRUCTOR when we don't need to, and controlling memory use in the 
constexpr code has largely been about avoiding extra CONSTRUCTORs.


OK without this hunk.


<EMPTY_CLASS_EXPR>: Lower it into a CONSTRUCTOR.
(potential_constant_expression_1) <COMPOUND_EXPR>: Remove
now-redundant handling of COMPOUND_EXPR with EMPTY_CLASS_EXPR
second operand.
<EMPTY_CLASS_EXPR>: Return true instead of false.

gcc/testsuite/ChangeLog:

PR c++/96575
* g++.dg/cpp1z/constexpr-96575.C: New test.
---
  gcc/cp/constexpr.c   | 20 
  gcc/testsuite/g++.dg/cpp1z/constexpr-96575.C | 19 +++
  2 files changed, 27 insertions(+), 12 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/cpp1z/constexpr-96575.C

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index a118f8a810b..0c13ff4db71 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -6070,13 +6070,11 @@ cxx_eval_constant_expression (const constexpr_ctx *ctx, tree t,
  case COMPOUND_EXPR:
{
/* check_return_expr sometimes wraps a TARGET_EXPR in a
-  COMPOUND_EXPR; don't get confused.  Also handle EMPTY_CLASS_EXPR
-  introduced by build_call_a.  */
+  COMPOUND_EXPR; don't get confused.  */
tree op0 = TREE_OPERAND (t, 0);
tree op1 = TREE_OPERAND (t, 1);
STRIP_NOPS (op1);
-   if ((TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
-   || TREE_CODE (op1) == EMPTY_CLASS_EXPR)
+   if (TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
  r = cxx_eval_constant_expression (ctx, op0,
lval, non_constant_p, overflow_p,
jump_target);
@@ -6403,9 +6401,9 @@ cxx_eval_constant_expression (const constexpr_ctx *ctx, tree t,
break;
  
  case EMPTY_CLASS_EXPR:

-  /* This is good enough for a function argument that might not get
-used, and they can't do anything with it, so just return it.  */
-  return t;
+  /* Handle EMPTY_CLASS_EXPR produced by build_call_a by lowering
+it to an appropriate CONSTRUCTOR.  */
+  return build_constructor (TREE_TYPE (t), NULL);
  
  case STATEMENT_LIST:

new_ctx = *ctx;
@@ -8186,13 +8184,11 @@ potential_constant_expression_1 (tree t, bool want_rval, bool strict, bool now,
  case COMPOUND_EXPR:
{
/* check_return_expr sometimes wraps a TARGET_EXPR in a
-  COMPOUND_EXPR; don't get confused.  Also handle EMPTY_CLASS_EXPR
-  introduced by build_call_a.  */
+  COMPOUND_EXPR; don't get confused.  */
tree op0 = TREE_OPERAND (t, 0);
tree op1 = TREE_OPERAND (t, 1);
STRIP_NOPS (op1);
-   if ((TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
-   || TREE_CODE (op1) == EMPTY_CLASS_EXPR)
+   if (TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
  return RECUR (op0, want_rval);
else
  goto binary;
@@ 

Re: [PATCH] Implement no_stack_protect attribute.

2020-10-21 Thread Nick Desaulniers via Gcc-patches
On Tue, Oct 20, 2020 at 5:19 AM Richard Biener
 wrote:
>
> On Tue, Oct 20, 2020 at 1:24 PM Martin Liška  wrote:
> >
> > PING^5
>
> So can we use the same identifier as clang here as Nick
> requests?  Thus, OK with re-naming everything alongside
> no_stack_protector.  It isn't really the opposite of the
> stack_protect attribute since that only protects when
> -fstack-protector-explicit is enabled.

I'll be happy to help test and review with an updated/rebased patch.

Tangentially related question:
We're running into a bug related to LTO for the kernel when code
compiled with -fno-stack-protector is called from and inlined into
code that is compiled with -fstack-protector.  Specifically, stack
canaries get checked before they're restored post suspend/resume which
leads to spooky bugs.

Once we have a more fine-grained function-level attribute to explicitly
disable stack protectors on a per-function basis, I'm considering
making this function attribute a barrier to inlining in LLVM so that
callers with stack protectors don't inline callees that explicitly
should not have a stack protector and vice versa (more concretely,
when they don't match).  I think this would maximize which functions
are still covered by stack protectors, and be the most straightforward
to implement.

The next question then is what happens when the callee is marked
__attribute__((always_inline))?  My answer for LLVM currently is
"still disallow inline substitution" which is more surprising than I'd
like, but we already have precedent for "always inline" not meaning
"always."  Warning from the frontend when mixing no_stack_protector
and always_inline is possible if the callers are visible and don't
match, but I don't think that works for cross translation unit calls.

I guess I was curious if others have ideas for solutions to this
particular problem?  Otherwise I plan to implement the above logic in
LLVM.  We'd eventually need matching logic in GCC to support LTO
kernels not having the same bug.

https://reviews.llvm.org/D87956
-- 
Thanks,
~Nick Desaulniers


Re: [PATCH 2/2, rs6000, V2] VSX load/store rightmost element operations

2020-10-21 Thread Segher Boessenkool
On Tue, Oct 20, 2020 at 04:34:46PM -0500, will schmidt wrote:
> This adds support for the VSX load/store rightmost element operations.
> This includes the instructions lxvrbx, lxvrhx, lxvrwx, lxvrdx,
> stxvrbx, stxvrhx, stxvrwx, stxvrdx; And the builtins
> vec_xl_sext() /* vector load sign extend */
> vec_xl_zext() /* vector load zero extend */
> vec_xst_trunc() /* vector store truncate */.

I think this is fine now.  Thanks!  Okay for trunk.


Segher


Re: [PATCH] c++: Handle RANGE_EXPR indexes in init_subob_ctx [PR97328]

2020-10-21 Thread Jason Merrill via Gcc-patches

On 10/8/20 4:47 PM, Patrick Palka wrote:

In the testcase below, we're ICEing during constexpr evaluation of the
CONSTRUCTOR {.data={{}, [1 ... 7]={}}} of type 'vector'.  The apparently
unique thing about this CONSTRUCTOR is that it has a RANGE_EXPR index
whose corresponding sub-aggregate initializer doesn't satisfy
reduced_constant_expression_p (because its field 't' is uninitialized).

This is a problem because init_subob_ctx currently punts if the
constructor index is a RANGE_EXPR, so when cxx_eval_bare_aggregate
recurses into this sub-aggregate initializer we trip over the
same_type_p assert in verify_ctor_sanity.

Fix this by making init_subob_ctx set up an appropriate sub-aggregate
initialization context even when the index is a RANGE_EXPR.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk and the 10 branch?

gcc/cp/ChangeLog:

PR c++/97328
* constexpr.c (init_subob_ctx): Don't punt if the index is a
RANGE_EXPR, instead build a sub-aggregate initialization context
with no subobject.

gcc/testsuite/ChangeLog:

PR c++/97328
* g++.dg/cpp2a/constexpr-init19.C: New test.
* g++.dg/cpp2a/constexpr-init20.C: New test.
---
  gcc/cp/constexpr.c| 13 +++--
  gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C | 15 +++
  gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C | 15 +++
  3 files changed, 37 insertions(+), 6 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index a118f8a810b..e50a2a220cb 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -3953,11 +3953,6 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx &new_ctx,
  {
new_ctx = *ctx;
  
-  if (index && TREE_CODE (index) != INTEGER_CST

-  && TREE_CODE (index) != FIELD_DECL)
-/* This won't have an element in the new CONSTRUCTOR.  */
-return;


Hmm, I wonder what this was trying to exclude?  I'd be more comfortable 
adding RANGE_EXPR to the allowed index codes.



tree type = initialized_type (value);
if (!AGGREGATE_TYPE_P (type) && !VECTOR_TYPE_P (type))
  /* A non-aggregate member doesn't get its own CONSTRUCTOR.  */
@@ -3967,7 +3962,13 @@ init_subob_ctx (const constexpr_ctx *ctx, constexpr_ctx &new_ctx,
   update object to refer to the subobject and ctor to refer to
   the (newly created) sub-initializer.  */
if (ctx->object)
-new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
+{
+  if (index == NULL_TREE || TREE_CODE (index) == RANGE_EXPR)
+   /* There's no well-defined subobject for this index.  */
+   new_ctx.object = NULL_TREE;
+  else
+   new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
+}
tree elt = build_constructor (type, NULL);
CONSTRUCTOR_NO_CLEARING (elt) = true;
new_ctx.ctor = elt;
diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C b/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
new file mode 100644
index 000..d354c5ad609
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
@@ -0,0 +1,15 @@
+// PR c++/97328
+// { dg-do compile { target c++20 } }
+
+struct vector {
+  struct storage {
+int t;
+constexpr storage() {}
+  } data[8];
+};
+
+constexpr auto foo() {
+  vector i;
+  return i;
+}
+auto val = foo();
diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C b/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
new file mode 100644
index 000..1a6ed8d86dd
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
@@ -0,0 +1,15 @@
+// PR c++/97328
+// { dg-do compile { target c++20 } }
+
+struct vector {
+  union storage {
+int t;
+constexpr storage() {}
+  } data[8];
+};
+
+constexpr auto foo() {
+  vector i;
+  return i;
+}
+auto val = foo();





Re: [PATCH] c++: Check DECL_TEMPLATE_PARM_P in duplicate_decls [PR97511]

2020-10-21 Thread Jason Merrill via Gcc-patches

On 10/21/20 4:35 PM, Patrick Palka wrote:

This makes duplicate_decls differentiate a TYPE_DECL for an alias
template from a TYPE_DECL for one of its template parameters.  The
recently added assert in template_parm_to_arg revealed this latent issue
because merging of the two TYPE_DECLs cleared the DECL_TEMPLATE_PARM_P
flag.

With this patch, we now also correctly diagnose the name shadowing in
the below testcase (as required by [temp.local]/6).

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK to
commit?


OK.


gcc/cp/ChangeLog:

PR c++/97511
* decl.c (duplicate_decls): Return NULL_TREE if
DECL_TEMPLATE_PARM_P differ.

gcc/testsuite/ChangeLog:

PR c++/97511
* g++.dg/template/shadow3.C: New test.
---
  gcc/cp/decl.c   | 3 +++
  gcc/testsuite/g++.dg/template/shadow3.C | 4 
  2 files changed, 7 insertions(+)
  create mode 100644 gcc/testsuite/g++.dg/template/shadow3.C

diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 5f370e60b4e..2de4e1657fb 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -2002,6 +2002,9 @@ duplicate_decls (tree newdecl, tree olddecl, bool hiding, bool was_hidden)
  || DECL_IMPLICIT_TYPEDEF_P (newdecl)))
  return NULL_TREE;
  
+  if (DECL_TEMPLATE_PARM_P (olddecl) != DECL_TEMPLATE_PARM_P (newdecl))

+return NULL_TREE;
+
if (!validate_constexpr_redeclaration (olddecl, newdecl))
  return error_mark_node;
  
diff --git a/gcc/testsuite/g++.dg/template/shadow3.C b/gcc/testsuite/g++.dg/template/shadow3.C

new file mode 100644
index 000..a5f256384ac
--- /dev/null
+++ b/gcc/testsuite/g++.dg/template/shadow3.C
@@ -0,0 +1,4 @@
+// PR c++/97511
+// { dg-do compile { target c++11 } }
+
+template <class Z> using Z = Z; // { dg-error "shadow|declaration" }





[PATCH] c++: Check DECL_TEMPLATE_PARM_P in duplicate_decls [PR97511]

2020-10-21 Thread Patrick Palka via Gcc-patches
This makes duplicate_decls differentiate a TYPE_DECL for an alias
template from a TYPE_DECL for one of its template parameters.  The
recently added assert in template_parm_to_arg revealed this latent issue
because merging of the two TYPE_DECLs cleared the DECL_TEMPLATE_PARM_P
flag.

With this patch, we now also correctly diagnose the name shadowing in
the below testcase (as required by [temp.local]/6).

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK to
commit?

gcc/cp/ChangeLog:

PR c++/97511
* decl.c (duplicate_decls): Return NULL_TREE if
DECL_TEMPLATE_PARM_P differ.

gcc/testsuite/ChangeLog:

PR c++/97511
* g++.dg/template/shadow3.C: New test.
---
 gcc/cp/decl.c   | 3 +++
 gcc/testsuite/g++.dg/template/shadow3.C | 4 
 2 files changed, 7 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/template/shadow3.C

diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 5f370e60b4e..2de4e1657fb 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -2002,6 +2002,9 @@ duplicate_decls (tree newdecl, tree olddecl, bool hiding, bool was_hidden)
  || DECL_IMPLICIT_TYPEDEF_P (newdecl)))
 return NULL_TREE;
 
+  if (DECL_TEMPLATE_PARM_P (olddecl) != DECL_TEMPLATE_PARM_P (newdecl))
+return NULL_TREE;
+
   if (!validate_constexpr_redeclaration (olddecl, newdecl))
 return error_mark_node;
 
diff --git a/gcc/testsuite/g++.dg/template/shadow3.C b/gcc/testsuite/g++.dg/template/shadow3.C
new file mode 100644
index 000..a5f256384ac
--- /dev/null
+++ b/gcc/testsuite/g++.dg/template/shadow3.C
@@ -0,0 +1,4 @@
+// PR c++/97511
+// { dg-do compile { target c++11 } }
+
+template <class Z> using Z = Z; // { dg-error "shadow|declaration" }
-- 
2.29.0.rc0



Re: [PATCH 2/8] [RS6000] rs6000_rtx_costs for AND

2020-10-21 Thread Segher Boessenkool
On Wed, Oct 21, 2020 at 01:27:42PM +1030, Alan Modra wrote:
> On Tue, Oct 20, 2020 at 01:55:56PM -0500, Segher Boessenkool wrote:
> > On Thu, Oct 08, 2020 at 09:27:54AM +1030, Alan Modra wrote:
> > > The existing "case AND" in this function is not sufficient for
> > > optabs.c:avoid_expensive_constant usage, where the AND is passed in
> > > outer_code.  We'd like to cost AND of rs6000_is_valid_and_mask
> > > or rs6000_is_valid_2insn_and variety there, so that those masks aren't
> > > seen as expensive (ie. better to load to a reg then AND).
> > > 
> > >   * config/rs6000/rs6000.c (rs6000_rtx_costs): Combine CONST_INT
> > >   AND handling with IOR/XOR.  Move costing for AND with
> > >   rs6000_is_valid_and_mask or rs6000_is_valid_2insn_and to
> > >   CONST_INT.
> > 
> > Sorry this took so long to review :-(
> > 
> > On 64-bit BE this leads to *bigger* code, and closer observation shows
> > that some common sequences degrade on all configs.  This seems to mostly
> > be about "andc" (and its dot form).  It wasn't costed properly before,
> > but after your patch, a single instruction is replaced by three.
> > 
> > Could you look into this?
> 
> ~/build/gcc-alan/gcc$ for z in *.o; do if test `objdump -dr $z | grep andc | wc -l` != `objdump -dr ../../gcc/gcc/$z | grep andc | wc -l`; then echo $z; fi; done
> gimplify.o
> insn-emit.o
> insn-opinit.o
> insn-recog.o
> rs6000-string.o
> 
> All of these are exactly the case I talked about in
> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553919.html

For a kernel build (my testcase) it happens more often.

> "Sometimes correct insn cost leads to unexpected results.  For
> example:
> 
> extern unsigned bar (void);
> unsigned
> f1 (unsigned a)
> {
>   if ((a & 0x01000200) == 0x01000200)
> return bar ();
>   return 0;
> }
> 
> emits for a & 0x01000200
>  (set (reg) (and (reg) (const_int 0x01000200)))
> at expand time (two rlwinm insns) rather than the older
>  (set (reg) (const_int 0x01000200))
>  (set (reg) (and (reg) (reg)))

And that is bad.  Why on earth does expand "optimise" this?  It should
not, it hinders various *real* optimisations!

> which is three insns.  However, since 0x01000200 is needed later the
> older code after optimisation is smaller."
> 
> Things have changed slightly since I wrote the above, with the two
> rlwinm insns being emitted at expand time, so you see
>  (set (reg) (and (reg) (const_int 0xff0003ff)))
>  (set (reg) (and (reg) (const_int 0x01fffe00)))

It has done that for many years?

> but of course that doesn't change anything regarding the cost of
> "a & 0x01000200".

Yeah.  But the problem is that costs that are "better", "closer to
reality", sometimes result in worse results :-(

Anyway:

+  || (outer_code == AND
+  && rs6000_is_valid_2insn_and (x, mode)))
{
  *total = COSTS_N_INSNS (1);
  return true;

It should return COSTS_N_INSNS (2) for that?

Testing with that now.


Segher


[committed] libstdc++: Simplify std::shared_ptr construction from std::weak_ptr

2020-10-21 Thread Jonathan Wakely via Gcc-patches
The _M_add_ref_lock() and _M_add_ref_lock_nothrow() members of
_Sp_counted_base are very similar, except that the former throws an
exception when the use count is zero and the latter returns false. The
former (and its callers) can be implemented in terms of the latter.
This results in a small reduction in code size, because throwing an
exception now only happens in one place.
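A small caller-side sketch of the two entry points (standard API only;
the nothrow path is the one weak_ptr::lock() uses):

#include <memory>
#include <cassert>

int main()
{
  std::weak_ptr<int> w;
  {
    auto sp = std::make_shared<int>(42);
    w = sp;
    assert(w.lock() != nullptr);  // use count nonzero: add-ref succeeds
  }
  assert(w.lock() == nullptr);    // expired: nothrow form just returns null
  try {
    std::shared_ptr<int> sp2(w);  // throwing form
  } catch (const std::bad_weak_ptr&) {
    // with this change the throw itself is emitted in a single place
  }
}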

libstdc++-v3/ChangeLog:

* include/bits/shared_ptr.h (shared_ptr(const weak_ptr&, nothrow_t)):
Add noexcept.
* include/bits/shared_ptr_base.h (_Sp_counted_base::_M_add_ref_lock):
Remove specializations and just call _M_add_ref_lock_nothrow.
(__shared_count, __shared_ptr): Use nullptr for null pointer
constants.
(__shared_count(const __weak_count&)): Use _M_add_ref_lock_nothrow
instead of _M_add_ref_lock.
(__shared_count(const __weak_count&, nothrow_t)): Add noexcept.
(__shared_ptr::operator bool()): Add noexcept.
(__shared_ptr(const __weak_ptr&, nothrow_t)): Add noexcept.

Tested powerpc64le-linux. Committed to trunk.

commit 945151b7f14c5d105abd8117f208ae9e3db91fb4
Author: Jonathan Wakely 
Date:   Wed Oct 21 21:13:41 2020

libstdc++: Simplify std::shared_ptr construction from std::weak_ptr

The _M_add_ref_lock() and _M_add_ref_lock_nothrow() members of
_Sp_counted_base are very similar, except that the former throws an
exception when the use count is zero and the latter returns false. The
former (and its callers) can be implemented in terms of the latter.
This results in a small reduction in code size, because throwing an
exception now only happens in one place.

libstdc++-v3/ChangeLog:

* include/bits/shared_ptr.h (shared_ptr(const weak_ptr&, 
nothrow_t)):
Add noexcept.
* include/bits/shared_ptr_base.h 
(_Sp_counted_base::_M_add_ref_lock):
Remove specializations and just call _M_add_ref_lock_nothrow.
(__shared_count, __shared_ptr): Use nullptr for null pointer
constants.
(__shared_count(const __weak_count&)): Use _M_add_ref_lock_nothrow
instead of _M_add_ref_lock.
(__shared_count(const __weak_count&, nothrow_t)): Add noexcept.
(__shared_ptr::operator bool()): Add noexcept.
(__shared_ptr(const __weak_ptr&, nothrow_t)): Add noexcept.

diff --git a/libstdc++-v3/include/bits/shared_ptr.h b/libstdc++-v3/include/bits/shared_ptr.h
index 0c393e23132..0bfb525aae7 100644
--- a/libstdc++-v3/include/bits/shared_ptr.h
+++ b/libstdc++-v3/include/bits/shared_ptr.h
@@ -413,7 +413,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
allocate_shared(const _Alloc& __a, _Args&&... __args);
 
   // This constructor is non-standard, it is used by weak_ptr::lock().
-  shared_ptr(const weak_ptr<_Tp>& __r, std::nothrow_t)
+  shared_ptr(const weak_ptr<_Tp>& __r, std::nothrow_t) noexcept
   : __shared_ptr<_Tp>(__r, std::nothrow) { }
 
   friend class weak_ptr<_Tp>;
diff --git a/libstdc++-v3/include/bits/shared_ptr_base.h b/libstdc++-v3/include/bits/shared_ptr_base.h
index ff578e66117..ca37f2bebd6 100644
--- a/libstdc++-v3/include/bits/shared_ptr_base.h
+++ b/libstdc++-v3/include/bits/shared_ptr_base.h
@@ -142,10 +142,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   { __gnu_cxx::__atomic_add_dispatch(&_M_use_count, 1); }
 
   void
-  _M_add_ref_lock();
+  _M_add_ref_lock()
+  {
+   if (!_M_add_ref_lock_nothrow())
+ __throw_bad_weak_ptr();
+  }
 
   bool
-  _M_add_ref_lock_nothrow();
+  _M_add_ref_lock_nothrow() noexcept;
 
   void
   _M_release() noexcept
@@ -214,48 +218,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   _Atomic_word  _M_weak_count;// #weak + (#shared != 0)
 };
 
-  template<>
-inline void
-_Sp_counted_base<_S_single>::
-_M_add_ref_lock()
-{
-  if (_M_use_count == 0)
-   __throw_bad_weak_ptr();
-  ++_M_use_count;
-}
-
-  template<>
-inline void
-_Sp_counted_base<_S_mutex>::
-_M_add_ref_lock()
-{
-  __gnu_cxx::__scoped_lock sentry(*this);
-  if (__gnu_cxx::__exchange_and_add_dispatch(&_M_use_count, 1) == 0)
-   {
- _M_use_count = 0;
- __throw_bad_weak_ptr();
-   }
-}
-
-  template<>
-inline void
-_Sp_counted_base<_S_atomic>::
-_M_add_ref_lock()
-{
-  // Perform lock-free add-if-not-zero operation.
-  _Atomic_word __count = _M_get_use_count();
-  do
-   {
- if (__count == 0)
-   __throw_bad_weak_ptr();
- // Replace the current counter value with the old value + 1, as
- // long as it's not changed meanwhile.
-   }
-  while (!__atomic_compare_exchange_n(&_M_use_count, &__count, __count + 1,
- true, __ATOMIC_ACQ_REL,
- __ATOMIC_RELAXED));
-}
-
   template<>
 inline bool
 

Re: [PATCH] libstdc++: Add c++2a

2020-10-21 Thread Thomas Rodgers



> On Oct 21, 2020, at 10:34 AM, Jonathan Wakely  wrote:
> 
> On 21/10/20 09:53 -0700, Thomas Rodgers wrote:
>> From: Thomas Rodgers 
>> 
>> libstdc++/Changelog:
>>  libstdc++-v3/doc/doxygen/user.cfg.in (INPUT): Add new header.
>>  libstdc++-v3/include/Makefile.am (std_headers): Add new header.
>>  libstdc++-v3/include/Makefile.in: Regenerate.
>>  libstdc++-v3/include/precompiled/stdc++.h: Include new header.
>>  libstdc++-v3/include/std/streambuf
>>   (__detail::__streambuf_core_access): Define.
>>   (basic_streambuf): Befriend __detail::__streambuf_core_access.
> 
> This file is no longer part of the commit, so the server will reject
> this changelog. Please ensure the changelog is accurate (the
> gcc-verify alias created by contrib/gcc-git-customization.sh can do
> that) and push, thanks.
> 

This patch is dependent on the changes to  so I can’t push until that 
patch lands.

Re: [PATCH] rs6000: MMA type causes an ICE in ranger pass due to incompatible types

2020-10-21 Thread Peter Bergner via Gcc-patches
On 10/21/20 1:34 PM, Segher Boessenkool wrote:
>> The following patch from Andrew and richi fixes the ICE on Martin's test
>> case and passes bootstrap and regtesting on powerpc64le-linux.
>> Ok for trunk?
> 
> Yes, okay for trunk.  Thanks!

Ok, pushed to trunk.  Thanks!



>> Since the ranger code that triggered this doesn't seem to be in GCC 10,
>> I assume we do not want to backport this change?
> 
> No, please do, in a week or so, it is a pretty serious problem that we
> could just asa well run into some other way, as far as I can see?

Ok, I'll wait a week and then do the backport and testing.

Peter



Re: Increase inlining limits for inline functions with builtin_constant_p on parameter

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Oct 21, 2020 at 09:03:03PM +0200, Martin Liška wrote:
> Maybe a silly question, but isn't the code only an obfuscation?
> What about doing:
> 
> if (__builtin_constant_p(x))
>   return x < 2 ? 0 : __builtin_clz(x);
> 
> which is fully eliminated in a c.031t.ccp1 pass right after einline happens
> for a call context where 'x' is a constant.
> 
> Isn't the right place for fix in Linux kernel?

Of course it is, see
https://lore.kernel.org/linux-toolchains/21556974-eea1-ed6a-ea6f-3e97a6eea...@csgroup.eu/T/#m12d2586fe18ed27789c8d67a677783a83b79efa8

Jakub



Re: Increase inlining limits for inline functions with builtin_constant_p on parameter

2020-10-21 Thread Martin Liška

On 10/21/20 5:11 PM, Jan Hubicka wrote:

return ( __builtin_constant_p((size) - 1) ? ( __builtin_constant_p((size) - 1)
? ( ((size) - 1) < 2 ? 0 : ((size) - 1) & (1ULL << 63) ? 63 : ((size) - 1) &
(1ULL << 62) ? 62 : ((size) - 1) & (1ULL << 61) ? 61 : ((size) - 1) & (1ULL <<
60) ? 60 : ((size) - 1) & (1ULL << 59) ? 59 : ((size) - 1) & (1ULL << 58) ? 58
: ((size) - 1) & (1ULL << 57) ? 57 : ((size) - 1) & (1ULL << 56) ? 56 : ((size)
- 1) & (1ULL << 55) ? 55 : ((size) - 1) & (1ULL << 54) ? 54 : ((size) - 1) &
(1ULL << 53) ? 53 : ((size) - 1) & (1ULL << 52) ? 52 : ((size) - 1) & (1ULL <<
51) ? 51 : ((size) - 1) & (1ULL << 50) ? 50 : ((size) - 1) & (1ULL << 49) ? 49
: ((size) - 1) & (1ULL << 48) ? 48 : ((size) - 1) & (1ULL << 47) ? 47 : ((size)
- 1) & (1ULL << 46) ? 46 : ((size) - 1) & (1ULL << 45) ? 45 : ((size) - 1) &
(1ULL << 44) ? 44 : ((size) - 1) & (1ULL << 43) ? 43 : ((size) - 1) & (1ULL <<
42) ? 42 : ((size) - 1) & (1ULL << 41) ? 41 : ((size) - 1) & (1ULL << 40) ? 40
: ((size) - 1) & (1ULL << 39) ? 39 : ((size) - 1) & (1ULL << 38) ? 38 : ((size)
- 1) & (1ULL << 37) ? 37 : ((size) - 1) & (1ULL << 36) ? 36 : ((size) - 1) &
(1ULL << 35) ? 35 : ((size) - 1) & (1ULL << 34) ? 34 : ((size) - 1) & (1ULL <<
33) ? 33 : ((size) - 1) & (1ULL << 32) ? 32 : ((size) - 1) & (1ULL << 31) ? 31
: ((size) - 1) & (1ULL << 30) ? 30 : ((size) - 1) & (1ULL << 29) ? 29 : ((size)
- 1) & (1ULL << 28) ? 28 : ((size) - 1) & (1ULL << 27) ? 27 : ((size) - 1) &
(1ULL << 26) ? 26 : ((size) - 1) & (1ULL << 25) ? 25 : ((size) - 1) & (1ULL <<
24) ? 24 : ((size) - 1) & (1ULL << 23) ? 23 : ((size) - 1) & (1ULL << 22) ? 22
: ((size) - 1) & (1ULL << 21) ? 21 : ((size) - 1) & (1ULL << 20) ? 20 : ((size)
- 1) & (1ULL << 19) ? 19 : ((size) - 1) & (1ULL << 18) ? 18 : ((size) - 1) &
(1ULL << 17) ? 17 : ((size) - 1) & (1ULL << 16) ? 16 : ((size) - 1) & (1ULL <<
15) ? 15 : ((size) - 1) & (1ULL << 14) ? 14 : ((size) - 1) & (1ULL << 13) ? 13
: ((size) - 1) & (1ULL << 12) ? 12 : ((size) - 1) & (1ULL << 11) ? 11 : ((size)
- 1) & (1ULL << 10) ? 10 : ((size) - 1) & (1ULL << 9) ? 9 : ((size) - 1) &
(1ULL << 8) ? 8 : ((size) - 1) & (1ULL << 7) ? 7 : ((size) - 1) & (1ULL << 6) ?
6 : ((size) - 1) & (1ULL << 5) ? 5 : ((size) - 1) & (1ULL << 4) ? 4 : ((size) -
1) & (1ULL << 3) ? 3 : ((size) - 1) & (1ULL << 2) ? 2 : 1) : -1) :
(sizeof((size) - 1) <= 4) ? __ilog2_u32((size) - 1) : __ilog2_u64((size) - 1) )
- 12 + 1;


Hello.

Maybe a silly question, but isn't the code only an obfuscation?
What about doing:

if (__builtin_constant_p(x))
  return x < 2 ? 0 : __builtin_clz(x);

which is fully eliminated in a c.031t.ccp1 pass right after einline happens
for a call context where 'x' is a constant.

Isn't the right place for fix in Linux kernel?
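For reference, a sketch of what a non-obfuscated kernel-side helper
could look like (hypothetical name, not the actual kernel patch):

static inline int const_ilog2_u64 (unsigned long long v)
{
  /* Folds to a constant for constant operands; for the rest the
     builtin expands to one or two instructions.  */
  if (v < 2)
    return 0;
  return 63 - __builtin_clzll (v);
}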

Martin


Re: [PATCH] rs6000: MMA type causes an ICE in ranger pass due to incompatible types

2020-10-21 Thread Segher Boessenkool
Hi!

On Wed, Oct 21, 2020 at 01:00:20PM -0500, Peter Bergner wrote:
> PR97360 shows a problem in how we create our PXI and POI modes that cause
> an ICE in the ranger pass.  The problem seems to be that the extra call
> to build_distinct_type_copy() also creates new TYPE_{MIN,MAX}_VALUEs that
> are not compatible/the same as the base type itself.  The simple "fix" is
> to actually remove the unneeded build_distinct_type_copy(), since according
> to richi, the types returned from make_unsigned_type() are already distinct.
> 
> The following patch from Andrew and richi fixes the ICE on Martin's test
> case and passes bootstrap and regtesting on powerpc64le-linux.
> Ok for trunk?

Yes, okay for trunk.  Thanks!

> Since the ranger code that triggered this doesn't seem to be in GCC 10,
> I assume we do not want to backport this change?

No, please do, in a week or so, it is a pretty serious problem that we
could just asa well run into some other way, as far as I can see?


Segher


> gcc/
>   PR target/97360
>   * config/rs6000/rs6000-call.c (rs6000_init_builtins): Remove call to
>   build_distinct_type_copy().
> 
> gcc/testsuite/
>   PR target/97360
>   * gcc.target/powerpc/pr97360.c: New test.


Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Segher Boessenkool
On Wed, Oct 21, 2020 at 06:09:28PM +0200, Uros Bizjak wrote:
> On Wed, Oct 21, 2020 at 4:45 PM Qing Zhao  wrote:
> > Looks like it’s very complicated to use the st/mm register set correctly, so
> > I assume that this set of registers might be very hard for the
> > attacker to use correctly.
> > Right?
> 
> Correct, but "very hard to be used" depends on how determined the attacker is.

Not only that, but the attacker only needs to get it right once, not for
every function (and not even for every program for that matter).


Segher


[PATCH] rs6000: MMA type causes an ICE in ranger pass due to incompatible types

2020-10-21 Thread Peter Bergner via Gcc-patches
PR97360 shows a problem in how we create our PXI and POI modes that cause
an ICE in the ranger pass.  The problem seems to be that the extra call
to build_distinct_type_copy() also creates new TYPE_{MIN,MAX}_VALUEs that
are not compatible/the same as the base type itself.  The simple "fix" is
to actually remove the unneeded build_distinct_type_copy(), since according
to richi, the types returned from make_unsigned_type() are already distinct.

The following patch from Andrew and richi fixes the ICE on Martin's test
case and passes bootstrap and regtesting on powerpc64le-linux.
Ok for trunk?

Since the ranger code that triggered this doesn't seem to be in GCC 10,
I assume we do not want to backport this change?

Peter


gcc/
PR target/97360
* config/rs6000/rs6000-call.c (rs6000_init_builtins): Remove call to
build_distinct_type_copy().

gcc/testsuite/
PR target/97360
* gcc.target/powerpc/pr97360.c: New test.

diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c
index 9fdf97bc803..7639aab171d 100644
--- a/gcc/config/rs6000/rs6000-call.c
+++ b/gcc/config/rs6000/rs6000-call.c
@@ -12914,15 +12914,13 @@ rs6000_init_builtins (void)
   /* Vector pair and vector quad support.  */
   if (TARGET_EXTRA_BUILTINS)
 {
-  tree oi_uns_type = make_unsigned_type (256);
-  vector_pair_type_node = build_distinct_type_copy (oi_uns_type);
+  vector_pair_type_node = make_unsigned_type (256);
   SET_TYPE_MODE (vector_pair_type_node, POImode);
   layout_type (vector_pair_type_node);
   lang_hooks.types.register_builtin_type (vector_pair_type_node,
  "__vector_pair");
 
-  tree xi_uns_type = make_unsigned_type (512);
-  vector_quad_type_node = build_distinct_type_copy (xi_uns_type);
+  vector_quad_type_node = make_unsigned_type (512);
   SET_TYPE_MODE (vector_quad_type_node, PXImode);
   layout_type (vector_quad_type_node);
   lang_hooks.types.register_builtin_type (vector_quad_type_node,
diff --git a/gcc/testsuite/gcc.target/powerpc/pr97360.c b/gcc/testsuite/gcc.target/powerpc/pr97360.c
new file mode 100644
index 000..2328d28a283
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr97360.c
@@ -0,0 +1,18 @@
+/* PR target/97360 */
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power10" } */
+
+/* Verify we do not ICE on the test below.  */
+
+typedef unsigned char vec_t __attribute__((vector_size(16)));
+
+void
+foo (__vector_quad *dst, __vector_pair *vpair, vec_t *vec)
+{
+  __vector_quad acc = *dst;
+  for (;;)
+{
+  __builtin_mma_xvf64gerpp(&acc, *vpair, vec[7]);
+}
+}


Re: [PATCH] phiopt: Optimize x ? __builtin_clz (x) : 32 in GIMPLE [PR97503]

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Oct 21, 2020 at 07:30:46PM +0200, Rainer Orth wrote:
> this broke sparc-sun-solaris2.11 bootstrap
> 
> /vol/gcc/src/hg/master/local/gcc/tree-ssa-phiopt.c: In function 'bool cond_removal_in_popcount_clz_ctz_pattern(basic_block, basic_block, edge, edge, gimple*, tree, tree)':
> /vol/gcc/src/hg/master/local/gcc/tree-ssa-phiopt.c:1858:27: error: variable 'mode' set but not used [-Werror=unused-but-set-variable]
>  1858 |   scalar_int_mode mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
>       |                   ^~~~
> 
> 
> and doubtlessly several other targets that use the defaults.h definition of
> 
> #define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE)  0

Ugh, seems many of those macros do not evaluate the first argument.
This got broken by the change to direct_internal_fn_supported_p, previously
it used mode also in the optab test.
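A standalone illustration of the warning pattern, assuming a
defaults.h-style fallback macro that ignores its operands:

#define DEFINED_VALUE_AT_ZERO(MODE, VALUE) 0	/* fallback definition */

extern int supported_p (int);

int
check (int type, int val)
{
  int mode = type * 2;	/* set, but the macro below never reads it */
  if (supported_p (type) && DEFINED_VALUE_AT_ZERO (mode, val) == 2)
    return 1;
  return 0;		/* -Wunused-but-set-variable fires on 'mode' */
}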

Anyway, I think this should fix it, I'll bootstrap/regtest it tonight:

2020-10-21  Jakub Jelinek  

* tree-ssa-phiopt.c (cond_removal_in_popcount_clz_ctz_pattern):
For CLZ and CTZ tests, use type temporary instead of mode.

--- gcc/tree-ssa-phiopt.c.jj	2020-10-21 19:33:12.358042645 +0200
+++ gcc/tree-ssa-phiopt.c   2020-10-21 19:35:18.113213095 +0200
@@ -1842,10 +1842,10 @@ cond_removal_in_popcount_clz_ctz_pattern
 CASE_CFN_CLZ:
   if (INTEGRAL_TYPE_P (TREE_TYPE (arg)))
{
- scalar_int_mode mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
- if (direct_internal_fn_supported_p (IFN_CLZ, TREE_TYPE (arg),
- OPTIMIZE_FOR_BOTH)
- && CLZ_DEFINED_VALUE_AT_ZERO (mode, val) == 2)
+ tree type = TREE_TYPE (arg);
+ if (direct_internal_fn_supported_p (IFN_CLZ, type, OPTIMIZE_FOR_BOTH)
+ && CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (type),
+   val) == 2)
{
  ifn = IFN_CLZ;
  break;
@@ -1855,10 +1855,10 @@ cond_removal_in_popcount_clz_ctz_pattern
 CASE_CFN_CTZ:
   if (INTEGRAL_TYPE_P (TREE_TYPE (arg)))
{
- scalar_int_mode mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
- if (direct_internal_fn_supported_p (IFN_CTZ, TREE_TYPE (arg),
- OPTIMIZE_FOR_BOTH)
- && CTZ_DEFINED_VALUE_AT_ZERO (mode, val) == 2)
+ tree type = TREE_TYPE (arg);
+ if (direct_internal_fn_supported_p (IFN_CTZ, type, OPTIMIZE_FOR_BOTH)
+ && CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (type),
+   val) == 2)
{
  ifn = IFN_CTZ;
  break;


Jakub



Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-21 Thread Richard Earnshaw via Gcc-patches
On 21/10/2020 17:11, Christophe Lyon via Gcc-patches wrote:
> On Wed, 21 Oct 2020 at 18:07, Richard Earnshaw
>  wrote:
>>
>> On 21/10/2020 16:49, Christophe Lyon via Gcc-patches wrote:
>>> On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
>>>  wrote:

 On 20/10/2020 12:22, Richard Earnshaw wrote:
> On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
>> On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
>>  wrote:
>>>
>>> On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
 On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
  wrote:
>
> On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
>> On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
>>  wrote:
>>>
>>> On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
 When mi_delta is > 255 and -mpure-code is used, we cannot load 
 delta
 from code memory (like we do without -mpure-code).

 This patch builds the value of mi_delta into r3 with a series of
 movs/adds/lsls.

 We also do some cleanup by not emitting the function address and 
 delta
 via .word directives at the end of the thunk since we don't use 
 them
 with -mpure-code.

 No need for new testcases, this bug was already identified by
 eg. pr46287-3.C

 2020-09-29  Christophe Lyon  

   gcc/
   * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in 
 r3 and
   do not emit function address and delta when -mpure-code is 
 used.
>>>
>> Hi Richard,
>>
>> Thanks for your comments.
>>
>>> There are some optimizations you can make to this code.
>>>
>>> Firstly, for values between 256 and 510 (inclusive), it would be 
>>> better
>>> to just expand a mov of 255 followed by an add.
>> I now see the splitter for the "Pe" constraint which I hadn't noticed
>> before, so I can write something similar indeed.
>>
>> However, I'm not quite sure I understand the benefit of the split
>> when -mpure-code is NOT used.
>> Consider:
>> int f3_1 (void) { return 510; }
>> int f3_2 (void) { return 511; }
>> Compile with -O2 -mcpu=cortex-m0:
>> f3_1:
>> movs    r0, #255
>> lsls    r0, r0, #1
>> bx  lr
>> f3_2:
>> ldr r0, .L4
>> bx  lr
>>
>> The splitter makes the code bigger, does it "compensate" for this by
>> not having to load the constant?
>> Actually the constant uses 4 more bytes, which should be taken into
>> account when comparing code size,
>
> Yes, the size of the literal pool entry needs to be taken into 
> account.
>  It might happen that the entry could be shared with another use of 
> that
> literal, but in general that's rare.
>
>> so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
>> thumb1 instructions would be equivalent in size compared to loading
>> from the literal pool. Should the 256-510 range be extended?
>
> It's a bit borderline at three instructions when literal pools are not
> expensive to use, but in thumb1 literal pools tend to be quite small 
> due
> to the limited pc offsets we can use.  I think on balance we probably
> want to use the instruction sequence unless optimizing for size.
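
For instance, 511 fits in three Thumb-1 instructions (a sketch, not compiler
output):

f3_3:
        movs    r0, #255
        lsls    r0, r0, #1      @ 510
        adds    r0, r0, #1      @ 511
        bx      lr
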
>
>>
>>
>>> This is also true for
>>> the literal pools alternative as well, so should be handled before 
>>> all
>>> this.
>> I am not sure what you mean: with -mpure-code, the above sample is 
>> compiled as:
>> f3_1:
>> movs    r0, #255
>> lsls    r0, r0, #1
>> bx  lr
>> f3_2:
>> movs    r0, #1
>> lsls    r0, r0, #8
>> adds    r0, r0, #255
>> bx  lr
>>
>> so the "return 510" case is already handled as without -mpure-code.
>
> I was thinking specifically of the thunk sequence where you seem to be
> emitting instructions directly rather than generating RTL.  The 
> examples
> you show here are not thunks.
>
 OK thanks for the clarification.

 Here is an updated version, split into 3 patches to hopefully make
 review easier.
 They apply on top of my other mpure-code patches for PR96967 and 
 PR96770:
 

Re: [PATCH] libstdc++: Add c++2a <syncstream>

2020-10-21 Thread Jonathan Wakely via Gcc-patches

On 21/10/20 09:53 -0700, Thomas Rodgers wrote:

From: Thomas Rodgers 

libstdc++/Changelog:
* libstdc++-v3/doc/doxygen/user.cfg.in (INPUT): Add new header.
* libstdc++-v3/include/Makefile.am (std_headers): Add new header.
* libstdc++-v3/include/Makefile.in: Regenerate.
* libstdc++-v3/include/precompiled/stdc++.h: Include new header.
* libstdc++-v3/include/std/streambuf
   (__detail::__streambuf_core_access): Define.
   (basic_streambuf): Befriend __detail::__streambuf_core_access.


This file is no longer part of the commit, so the server will reject
this changelog. Please ensure the changelog is accurate (the
gcc-verify alias created by contrib/gcc-git-customization.sh can do
that) and push, thanks.



Re: [PATCH] phiopt: Optimize x ? __builtin_clz (x) : 32 in GIMPLE [PR97503]

2020-10-21 Thread Rainer Orth
Hi Jakub,

> While we have at the RTL level noce_try_ifelse_collapse combined with
> simplify_cond_clz_ctz, that optimization doesn't always trigger because
> e.g. on powerpc there is an define_insn to compare a reg against zero and
> copy that register to another one and so we end up with a different pseudo
> in the simplify_cond_clz_ctz test and punt.
>
> For targets that define C?Z_DEFINED_VALUE_AT_ZERO to 2 for certain modes,
> we can optimize it already in phiopt though, just need to ensure that
> we transform the __builtin_c?z* calls into .C?Z ifns because my recent
> VRP changes codified that the builtin calls are always undefined at zero,
> while ifns honor C?Z_DEFINED_VALUE_AT_ZERO equal to 2.
> And, in phiopt we already have popcount handling that does pretty much the
> same thing, except for always using a zero value rather than the one set
> by C?Z_DEFINED_VALUE_AT_ZERO.
>
> So, this patch extends that function to handle not just popcount, but also
> clz and ctz.
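
For reference, the source shape the quoted description targets (a sketch, not
a testcase from the patch): on a target whose CLZ_DEFINED_VALUE_AT_ZERO is 2
with value 32 for SImode, the whole branch can collapse to one .CLZ call:

unsigned
clz_or_32 (unsigned x)
{
  /* Without the phiopt change this stays a branch, because
     __builtin_clz (0) is undefined; with it, the conditional
     becomes a single branchless internal-function call.  */
  return x ? __builtin_clz (x) : 32;
}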

this broke sparc-sun-solaris2.11 bootstrap

/vol/gcc/src/hg/master/local/gcc/tree-ssa-phiopt.c: In function 'bool 
cond_removal_in_popcount_clz_ctz_pattern(basic_block, basic_block, edge, edge, 
gimple*, tree, tree)':
/vol/gcc/src/hg/master/local/gcc/tree-ssa-phiopt.c:1858:27: error: variable 
'mode' set but not used [-Werror=unused-but-set-variable]
 1858 |   scalar_int_mode mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
  |   ^~~~


and doubtlessly several other targets that use the defaults.h definition of

#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE)  0

Rainer

-- 
Rainer Orth, Center for Biotechnology, Bielefeld University


Re: [Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Arnaud Charlet
> >What about instead putting above extern long long __gnat_file_time the
> >following:
> >
> >#if __APPLE__
> >#define st_mtim st_mtimespec
> >#define st_atim st_atimespec
> >#endif
> >
> >To avoid having the two (nested) #if __APPLE__ and keep the code easier

two => three :-)

> >to follow?
> 
> works for me (the test patch was drafted quickly to allow bootstrap
> to continue)
> - I can amend the patch and (re-)test more widely.

OK then with these changes, assuming successfully build/test.

Arno


[PATCH] libstdc++: Add c++2a <syncstream>

2020-10-21 Thread Thomas Rodgers
From: Thomas Rodgers 

libstdc++/Changelog:
* libstdc++-v3/doc/doxygen/user.cfg.in (INPUT): Add new header.
* libstdc++-v3/include/Makefile.am (std_headers): Add new header.
* libstdc++-v3/include/Makefile.in: Regenerate.
* libstdc++-v3/include/precompiled/stdc++.h: Include new header.
* libstdc++-v3/include/std/streambuf
(__detail::__streambuf_core_access): Define.
(basic_streambuf): Befriend __detail::__streambuf_core_access.
* libstdc++-v3/include/std/syncstream: New header.
* libstdc++-v3/include/std/version: Add __cpp_lib_syncbuf.
* libstdc++-v3/testsuite/27_io/basic_syncbuf/1.cc: New test.
* libstdc++-v3/testsuite/27_io/basic_syncbuf/2.cc: Likewise.
* libstdc++-v3/testsuite/27_io/basic_syncbuf/basic_ops/1.cc:
Likewise.
* libstdc++-v3/testsuite/27_io/basic_syncbuf/requirements/types.cc:
Likewise.
* libstdc++-v3/testsuite/27_io/basic_syncbuf/sync_ops/1.cc:
Likewise.
* libstdc++-v3/testsuite/27_io/basic_syncstream/1.cc: Likewise.
* libstdc++-v3/testsuite/27_io/basic_syncstream/2.cc: Likewise.
* libstdc++-v3/testsuite/27_io/basic_syncstream/basic_ops/1.cc:
Likewise.
* libstdc++-v3/testsuite/27_io/basic_syncstream/requirements/types.cc:
Likewise.

---
 libstdc++-v3/doc/doxygen/user.cfg.in  |   1 +
 libstdc++-v3/include/Makefile.am  |   1 +
 libstdc++-v3/include/Makefile.in  |   1 +
 libstdc++-v3/include/precompiled/stdc++.h |   2 +-
 libstdc++-v3/include/std/syncstream   | 279 ++
 libstdc++-v3/include/std/version  |   4 +
 .../testsuite/27_io/basic_syncbuf/1.cc|  28 ++
 .../testsuite/27_io/basic_syncbuf/2.cc|  27 ++
 .../27_io/basic_syncbuf/basic_ops/1.cc| 138 +
 .../27_io/basic_syncbuf/requirements/types.cc |  42 +++
 .../27_io/basic_syncbuf/sync_ops/1.cc | 130 
 .../testsuite/27_io/basic_syncstream/1.cc |  28 ++
 .../testsuite/27_io/basic_syncstream/2.cc |  27 ++
 .../27_io/basic_syncstream/basic_ops/1.cc | 135 +
 .../basic_syncstream/requirements/types.cc|  43 +++
 15 files changed, 885 insertions(+), 1 deletion(-)
 create mode 100644 libstdc++-v3/include/std/syncstream
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncbuf/1.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncbuf/2.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncbuf/basic_ops/1.cc
 create mode 100644 
libstdc++-v3/testsuite/27_io/basic_syncbuf/requirements/types.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncbuf/sync_ops/1.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncstream/1.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncstream/2.cc
 create mode 100644 libstdc++-v3/testsuite/27_io/basic_syncstream/basic_ops/1.cc
 create mode 100644 
libstdc++-v3/testsuite/27_io/basic_syncstream/requirements/types.cc

diff --git a/libstdc++-v3/doc/doxygen/user.cfg.in 
b/libstdc++-v3/doc/doxygen/user.cfg.in
index 9b49a15d31b..320f6dea688 100644
--- a/libstdc++-v3/doc/doxygen/user.cfg.in
+++ b/libstdc++-v3/doc/doxygen/user.cfg.in
@@ -897,6 +897,7 @@ INPUT  = @srcdir@/doc/doxygen/doxygroups.cc 
\
  include/streambuf \
  include/string \
  include/string_view \
+ include/syncstream \
  include/system_error \
  include/thread \
  include/tuple \
diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am
index 28d273924ee..61aaff7a2f4 100644
--- a/libstdc++-v3/include/Makefile.am
+++ b/libstdc++-v3/include/Makefile.am
@@ -73,6 +73,7 @@ std_headers = \
${std_srcdir}/shared_mutex \
${std_srcdir}/span \
${std_srcdir}/sstream \
+   ${std_srcdir}/syncstream \
${std_srcdir}/stack \
${std_srcdir}/stdexcept \
${std_srcdir}/stop_token \
diff --git a/libstdc++-v3/include/precompiled/stdc++.h 
b/libstdc++-v3/include/precompiled/stdc++.h
index 7518a98c25a..8899c323a28 100644
--- a/libstdc++-v3/include/precompiled/stdc++.h
+++ b/libstdc++-v3/include/precompiled/stdc++.h
@@ -141,6 +141,6 @@
 #include 
 #include 
 #include 
-// #include <syncstream>
+#include <syncstream>
 #include 
 #endif
diff --git a/libstdc++-v3/include/std/syncstream 
b/libstdc++-v3/include/std/syncstream
new file mode 100644
index 000..3f78cef1d8d
--- /dev/null
+++ b/libstdc++-v3/include/std/syncstream
@@ -0,0 +1,279 @@
+//  -*- C++ -*-
+
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This 

Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Qing Zhao via Gcc-patches



> On Oct 21, 2020, at 11:09 AM, Uros Bizjak  wrote:
> 
> On Wed, Oct 21, 2020 at 4:45 PM Qing Zhao  wrote:
>> 
>> 
>> 
>> -- q --
>> The CPU shall be in x87 mode upon entry to a function. Therefore,
>> every function that uses the MMX registers is required to issue an
>> emms or femms instruction after using MMX registers, before returning
>> or calling another function.
>> -- /q --
>> 
>> (The above requirement slightly contradicts its own ABI, since we have
>> 3 MMX argument registers and MMX return register, so the CPU obviously
>> can't be in x87 mode at all function boundaries).
>> 
>> So, assuming that the first sentence is not deliberately vague w.r.t
>> function exit, emms should not be needed. However, we are dealing with
>> x87 stack registers that have their own set of peculiarities. It is
>> not possible to load a random register in the way you show.  Also,
>> stack should be either empty or one (two in case of complex value
>> return) levels deep at the function return. I think you want a series
>> of 8 or 7(6) fldz insns, followed by a series of fstp insns to clear
>> the stack and mark stack slots empty.
>> 
>> 
>> Something like this:
>> 
>> --cut here--
>> long double
>> __attribute__ ((noinline))
>> test (long double a, long double b)
>> {
>> long double r = a + b;
>> 
>> asm volatile ("fldz;\
>>   fldz;\
>>   fldz;\
>>   fldz;\
>>   fldz;\
>>   fldz;\
>>   fldz;\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0);\
>>   fstp %%st(0)" : : "X"(r));
>> return r;
>> }
>> 
>> int
>> main ()
>> {
>> long double a = 1.1, b = 1.2;
>> 
>> long double c = test (a, b);
>> 
>> printf ("%Lf\n", c);
>> 
>> return 0;
>> }
>> --cut here—
>> 
>> 
>> 
>> Okay, so,
>> 
>> 1. First compute how many st registers need to be zeroed,  num_of_zeroed_st
>> 2. Then issue (8 - num_of_zeroed_st) fldz to push 0 to the stack to clear 
>> all the dead stack slots;
>> 3. Then issue (8 - num_of_zeroed_st) fstp %st(0) to pop the stack and empty 
>> the stack.
>> 
>> Is the above understanding correct?
> 
> Yes.
> 
>> Another thought is:
>> 
>> Looks like it’s very complicated to use the st/mm register set correctly, so
>> I assume that this set of registers might be very hard to be used by the 
>> attacker correctly.
>> Right?
> 
> Correct, but "very hard to be used" depends on how determined the attacker is.

Okay, I see.
Then I will clear the st registers per the above algorithm you suggested.
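
Roughly, in terms of the backend (a sketch only: gen_fldz and gen_fstp stand
in for whatever i386.md patterns the final patch ends up using, and
num_of_zeroed_st is the count from step 1 of the algorithm above):

/* Push a zero into every slot to be cleared, then pop them all again,
   leaving the x87 stack empty as the ABI expects at function exit.  */
unsigned n_slots = 8 - num_of_zeroed_st;
for (unsigned i = 0; i < n_slots; i++)
  emit_insn (gen_fldz ());               /* step 2: n_slots x fldz */
for (unsigned i = 0; i < n_slots; i++)
  emit_insn (gen_fstp ());               /* step 3: n_slots x fstp %st(0) */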

Thanks a lot for the help.

Qing
> 
> Uros.



Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Qing Zhao via Gcc-patches
Got it.

thanks.

Qing

> On Oct 21, 2020, at 10:47 AM, Richard Sandiford  
> wrote:
> 
> Qing Zhao  writes:
 +  /* For each of the hard registers, check to see whether we should zero 
 it if:
 + 1. it is a call-used register;
 + and 2. it is not a fixed register;
 + and 3. it is not live at the return of the routine;
 + and 4. it is a general register if gpr_only is true;
 + and 5. it is used in the routine if used_only is true;
 + and 6. it is a register that passes a parameter if arg_only is true;
 +   */
 +
 +  HARD_REG_SET need_zeroed_hardregs;
 +  CLEAR_HARD_REG_SET (need_zeroed_hardregs);
 +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
 +{
 +  if (!this_target_hard_regs->x_call_used_regs[regno])
 +  continue;
>>> 
>>> This should use crtl->abi instead.  The set of call-used registers
>>> can vary from function to function.
>> 
>> You mean to use:
>> 
>> If (!crtl->abi->clobbers_full_reg_p(regno))
>> 
>> ?
> 
> Yeah, that's right.  (But with a space before “(regno)” :-))
> 
 +static unsigned int
 +rest_of_zero_call_used_regs (void)
 +{
 +  basic_block bb;
 +  rtx_insn *insn;
 +
 +  /* This pass needs data flow information.  */
 +  df_analyze ();
 +
 +  /* Search all the "return"s in the routine, and insert instruction 
 sequence to
 + zero the call used registers.  */
 +  FOR_EACH_BB_REVERSE_FN (bb, cfun)
 +if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)
 +  || (single_succ_p (bb)
 +  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)))
 +  FOR_BB_INSNS_REVERSE (bb, insn)
 +  if (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
 +{
 +  /* Now we can insert the instruction sequence to zero the call used
 + registers before this insn.  */
 +  gen_call_used_regs_seq (insn);
 +  break;
 +}
>>> 
>>> The exit block never has instructions, so it's only necessary to process
>>> predecessors.  A simpler way to do that is to iterate over the edges in:
>>> 
>>> EXIT_BLOCK_PTR_FOR_FN (cfun)->preds
>>> 
>>> You shouldn't need to use FOR_BB_INSNS_REVERSE: it should be enough
>>> to check only BB_END (bb), since returns always end blocks.
>> 
>> Something like the following?
>> 
>>  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>>{
>> insn = BB_END (e->src);
>>  If (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
>>{
>>/* Now we can insert the instruction sequence to zero the call used
>> registers before this insn.  */
>>  gen_call_used_regs_seq (insn);
>>  break;   
>>}
>>  }
> 
> With this you don't want/need the break, since it would break out
> of the edge traversal (instead of the FOR_BB_INSNS_REVERSE, as above).
> Also, I think the code becomes simple enough that the comment isn't
> really needed.
> 
> Thanks,
> Richard



Re: [Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Iain Sandoe

Arnaud Charlet  wrote:


This patch breaks bootstrap on Darwin platforms.

Pierre-Marie de Rodat  wrote:


The modification file time precision is now defined by the OS.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* adaint.c (__gnat_file_time): New routine.
(__gnat_copy_attribs): Copy timestamps in nanoseconds.
* libgnat/a-direct.adb (C_Modification_Time): Bind to
__gnat_file_time.
(Modification_Time): Call to C_Modification_Time.


#if defined(st_mtime)

is a necessary test - but the fields in the stat structure on Darwin
platforms are named st_{a,c,m}timespec rather than the Linux st_{a,c,m}tim.


What about instead putting above extern long long __gnat_file_time the
following:

#if __APPLE__
#define st_mtim st_mtimespec
#define st_atim st_atimespec
#endif

To avoid having the two (nested) #if __APPLE__ and keep the code easier
to follow?


works for me (the test patch was drafted quickly to allow bootstrap to  
continue)

- I can amend the patch and (re-)test more widely.

Iain



Re: [PATCH] c++: Handle RANGE_EXPR indexes in init_subob_ctx [PR97328]

2020-10-21 Thread Patrick Palka via Gcc-patches
On Thu, 8 Oct 2020, Patrick Palka wrote:

> In the testcase below, we're ICEing during constexpr evaluation of the
> CONSTRUCTOR {.data={{}, [1 ... 7]={}}} of type 'vector'.  The apparently
> unique thing about this CONSTRUCTOR is that it has a RANGE_EXPR index
> whose corresponding sub-aggregate initializer doesn't satisfy
> reduced_constant_expression_p (because its field 't' is uninitialized).
> 
> This is a problem because init_subob_ctx currently punts if the
> constructor index is a RANGE_EXPR, so when cxx_eval_bare_aggregate
> recurses into this sub-aggregate initializer we trip over the
> same_type_p assert in verify_ctor_sanity.
> 
> Fix this by making init_subob_ctx set up an appropriate sub-aggregate
> initialization context even when the index is a RANGE_EXPR.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> trunk and the 10 branch?
> 
> gcc/cp/ChangeLog:
> 
>   PR c++/97328
>   * constexpr.c (init_subob_ctx): Don't punt if the index is a
>   RANGE_EXPR, instead build a sub-aggregate initialization context
>   with no subobject.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR c++/97328
>   * g++.dg/cpp2a/constexpr-init19.C: New test.
>   * g++.dg/cpp2a/constexpr-init20.C: New test.

Ping.
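
For reference, the initializer shape at issue as a standalone sketch (my
reduction of the quoted CONSTRUCTOR; the GNU range designator shares one
sub-initializer across indices 1..7):

struct storage { int t; };
struct vector { struct storage data[8]; };
struct vector v = { .data = { {}, [1 ... 7] = {} } };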

> ---
>  gcc/cp/constexpr.c| 13 +++--
>  gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C | 15 +++
>  gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C | 15 +++
>  3 files changed, 37 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
> 
> diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
> index a118f8a810b..e50a2a220cb 100644
> --- a/gcc/cp/constexpr.c
> +++ b/gcc/cp/constexpr.c
> @@ -3953,11 +3953,6 @@ init_subob_ctx (const constexpr_ctx *ctx, 
> constexpr_ctx _ctx,
>  {
>new_ctx = *ctx;
>  
> -  if (index && TREE_CODE (index) != INTEGER_CST
> -  && TREE_CODE (index) != FIELD_DECL)
> -/* This won't have an element in the new CONSTRUCTOR.  */
> -return;
> -
>tree type = initialized_type (value);
>if (!AGGREGATE_TYPE_P (type) && !VECTOR_TYPE_P (type))
>  /* A non-aggregate member doesn't get its own CONSTRUCTOR.  */
> @@ -3967,7 +3962,13 @@ init_subob_ctx (const constexpr_ctx *ctx, 
> constexpr_ctx _ctx,
>   update object to refer to the subobject and ctor to refer to
>   the (newly created) sub-initializer.  */
>if (ctx->object)
> -new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
> +{
> +  if (index == NULL_TREE || TREE_CODE (index) == RANGE_EXPR)
> + /* There's no well-defined subobject for this index.  */
> + new_ctx.object = NULL_TREE;
> +  else
> + new_ctx.object = build_ctor_subob_ref (index, type, ctx->object);
> +}
>tree elt = build_constructor (type, NULL);
>CONSTRUCTOR_NO_CLEARING (elt) = true;
>new_ctx.ctor = elt;
> diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C 
> b/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
> new file mode 100644
> index 000..d354c5ad609
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-init19.C
> @@ -0,0 +1,15 @@
> +// PR c++/97328
> +// { dg-do compile { target c++20 } }
> +
> +struct vector {
> +  struct storage {
> +int t;
> +constexpr storage() {}
> +  } data[8];
> +};
> +
> +constexpr auto foo() {
> +  vector i;
> +  return i;
> +}
> +auto val = foo();
> diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C 
> b/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
> new file mode 100644
> index 000..1a6ed8d86dd
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-init20.C
> @@ -0,0 +1,15 @@
> +// PR c++/97328
> +// { dg-do compile { target c++20 } }
> +
> +struct vector {
> +  union storage {
> +int t;
> +constexpr storage() {}
> +  } data[8];
> +};
> +
> +constexpr auto foo() {
> +  vector i;
> +  return i;
> +}
> +auto val = foo();
> -- 
> 2.29.0.rc0
> 
> 



[PATCH] c++: constexpr evaluation and bare EMPTY_CLASS_EXPR [PR96575]

2020-10-21 Thread Patrick Palka via Gcc-patches
In the testcase below, folding of the initializer for 'ret' inside the
instantiated f::lambda ends up yielding an initializer for which
potential_constant_expression returns false.  This causes finish_function
to mark the lambda as non-constexpr, which ultimately causes us to reject
'f(g)' as a call to a non-constexpr function.

The initializer for 'ret' inside f::lambda, prior to folding, is
the CALL_EXPR

  ::operator() (, ({}, <<< Unknown tree: empty_class_expr >>>;))

where the second argument is a COMPOUND_EXPR whose second operand is an
EMPTY_CLASS_EXPR, formed by build_call_a.  cp_fully_fold_init is able
to only partially fold this initializer, doing away with the COMPOUND_EXPR
to yield

  ::operator() (, <<< Unknown tree: empty_class_expr >>>)

as the final initializer for 'ret'.  This initializer no longer satisfies
potential_constant_expression because this predicate returns false when
it sees a bare EMPTY_CLASS_EXPR that's not wrapped in a COMPOUND_EXPR.

(cp_fully_fold_init first tries maybe_constant_value on the original
CALL_EXPR, but constexpr evaluation punts upon seeing
__builtin_is_constant_evaluated, since manifestly_const_eval is false.)

To fix this, it seems to me we could either make cp_fold preserve
the COMPOUND_EXPR trees produced by build_call_a, or we could
modify potential_constant_expression and friends to handle "bare"
EMPTY_CLASS_EXPR trees.  Assuming it's safe to continue folding
away these COMPOUND_EXPRs, the second approach seems cleaner, so this
patch implements the second approach.
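
A hypothetical reduction of the kind of argument build_call_a wraps this way
(the PR's actual testcase goes through a generic lambda and
__builtin_is_constant_evaluated; 'empty' and 'f' here are made up):

struct empty {};
constexpr int f (empty) { return 42; }
static_assert (f (empty{}) == 42, "");  // the by-value empty-class argument is
                                        // built as (arg, EMPTY_CLASS_EXPR)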

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk?

gcc/cp/ChangeLog:

PR c++/96575
* constexpr.c (cxx_eval_constant_expression)
<COMPOUND_EXPR>: Remove now-redundant handling of
COMPOUND_EXPR with EMPTY_CLASS_EXPR second operand.
<EMPTY_CLASS_EXPR>: Lower it into a CONSTRUCTOR.
(potential_constant_expression_1) <COMPOUND_EXPR>: Remove
now-redundant handling of COMPOUND_EXPR with EMPTY_CLASS_EXPR
second operand.
<EMPTY_CLASS_EXPR>: Return true instead of false.

gcc/testsuite/ChangeLog:

PR c++/96575
* g++.dg/cpp1z/constexpr-96575.C: New test.
---
 gcc/cp/constexpr.c   | 20 
 gcc/testsuite/g++.dg/cpp1z/constexpr-96575.C | 19 +++
 2 files changed, 27 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp1z/constexpr-96575.C

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index a118f8a810b..0c13ff4db71 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -6070,13 +6070,11 @@ cxx_eval_constant_expression (const constexpr_ctx *ctx, 
tree t,
 case COMPOUND_EXPR:
   {
/* check_return_expr sometimes wraps a TARGET_EXPR in a
-  COMPOUND_EXPR; don't get confused.  Also handle EMPTY_CLASS_EXPR
-  introduced by build_call_a.  */
+  COMPOUND_EXPR; don't get confused.  */
tree op0 = TREE_OPERAND (t, 0);
tree op1 = TREE_OPERAND (t, 1);
STRIP_NOPS (op1);
-   if ((TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
-   || TREE_CODE (op1) == EMPTY_CLASS_EXPR)
+   if (TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
  r = cxx_eval_constant_expression (ctx, op0,
lval, non_constant_p, overflow_p,
jump_target);
@@ -6403,9 +6401,9 @@ cxx_eval_constant_expression (const constexpr_ctx *ctx, 
tree t,
   break;
 
 case EMPTY_CLASS_EXPR:
-  /* This is good enough for a function argument that might not get
-used, and they can't do anything with it, so just return it.  */
-  return t;
+  /* Handle EMPTY_CLASS_EXPR produced by build_call_a by lowering
+it to an appropriate CONSTRUCTOR.  */
+  return build_constructor (TREE_TYPE (t), NULL);
 
 case STATEMENT_LIST:
   new_ctx = *ctx;
@@ -8186,13 +8184,11 @@ potential_constant_expression_1 (tree t, bool 
want_rval, bool strict, bool now,
 case COMPOUND_EXPR:
   {
/* check_return_expr sometimes wraps a TARGET_EXPR in a
-  COMPOUND_EXPR; don't get confused.  Also handle EMPTY_CLASS_EXPR
-  introduced by build_call_a.  */
+  COMPOUND_EXPR; don't get confused.  */
tree op0 = TREE_OPERAND (t, 0);
tree op1 = TREE_OPERAND (t, 1);
STRIP_NOPS (op1);
-   if ((TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
-   || TREE_CODE (op1) == EMPTY_CLASS_EXPR)
+   if (TREE_CODE (op0) == TARGET_EXPR && op1 == TARGET_EXPR_SLOT (op0))
  return RECUR (op0, want_rval);
else
  goto binary;
@@ -8321,7 +8317,7 @@ potential_constant_expression_1 (tree t, bool want_rval, 
bool strict, bool now,
   return true;
 
 case EMPTY_CLASS_EXPR:
-  return false;
+  return true;
 
 case GOTO_EXPR:
   {
diff --git a/gcc/testsuite/g++.dg/cpp1z/constexpr-96575.C 

Re: [Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Arnaud Charlet
> This patch breaks bootstrap on Darwin platforms.
> 
> Pierre-Marie de Rodat  wrote:
> 
> > The modification file time precision is now defined by the OS.
> > 
> > Tested on x86_64-pc-linux-gnu, committed on trunk
> > 
> > gcc/ada/
> > 
> > * adaint.c (__gnat_file_time): New routine.
> > (__gnat_copy_attribs): Copy timestamps in nanoseconds.
> > * libgnat/a-direct.adb (C_Modification_Time): Bind to
> > __gnat_file_time.
> > (Modification_Time): Call to C_Modification_Time.
> 
> #if defined(st_mtime)
> 
> is a necessary test - but the fields in the stat structure on Darwin 
> platforms are
> named st_{a,c,m}timespec rather than the Linux st_{a,c,m}tim.

What about instead putting above extern long long __gnat_file_time the
following:

#if __APPLE__
#define st_mtim st_mtimespec
#define st_atim st_atimespec
#endif

To avoid having the two (nested) #if __APPLE__ and keep the code easier
to follow?
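
The use sites could then keep the Linux spelling throughout; a sketch of the
resulting single code path (untested):

#if defined(st_mtime)
  /* st_mtim works on Linux directly and, through the mapping above,
     on Darwin as well.  */
  if (__builtin_saddll_overflow(result, sb.st_mtim.tv_nsec, &result)) {
    return LLONG_MIN;
  }
#endif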

Arno


Re: [PATCH] [PR rtl-optimization/97249]Simplify vec_select of paradoxical subreg.

2020-10-21 Thread Segher Boessenkool
On Wed, Oct 21, 2020 at 04:43:29PM +0100, Richard Sandiford wrote:
> Hongtao Liu  writes:
> > + poly_uint64 nunits
> > +   = GET_MODE_NUNITS (GET_MODE (SUBREG_REG (trueop0)));
> > + rtx par = trueop1;
> > + for (int i = 0; i != l1; i++)
> > +   {
> > + rtx idx = XVECEXP (trueop1, 0, i);
> > + if (!CONST_INT_P (idx)
> > + || maybe_ge (UINTVAL (idx) + subreg_offset, nunits))
> > +   return 0;
> > +   }
> 
> I think the previous version was better.  We shouldn't assume that
> further simplification rules will fail just because the conditions
> for this rule haven't been met.

Yes.  My suggestion was to factor this big piece of code to a separate
function, and do an early return from *that*.
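
i.e., roughly (a sketch; the helper name is hypothetical and the loop body is
lifted from the hunk quoted above):

static rtx
simplify_vec_select_of_subreg (rtx trueop0, rtx trueop1, int l1,
                               poly_uint64 subreg_offset)
{
  poly_uint64 nunits
    = GET_MODE_NUNITS (GET_MODE (SUBREG_REG (trueop0)));
  for (int i = 0; i != l1; i++)
    {
      rtx idx = XVECEXP (trueop1, 0, i);
      if (!CONST_INT_P (idx)
          || maybe_ge (UINTVAL (idx) + subreg_offset, nunits))
        return NULL_RTX;  /* punting here leaves the caller free to try
                             the remaining simplification rules */
    }
  /* ... the actual vec_select simplification goes here ...  */
  return NULL_RTX;
}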

The patch is okay for trunk without that, with the clumsy booleans.
Thanks Hongtao!


Segher


Re: [Ada] Improve precision of Ada.Directories.Modification_Time

2020-10-21 Thread Iain Sandoe via Gcc-patches
Hi Folks,

This patch breaks bootstrap on Darwin platforms.

Pierre-Marie de Rodat  wrote:

> The modification file time precision is now defined by the OS.
> 
> Tested on x86_64-pc-linux-gnu, committed on trunk
> 
> gcc/ada/
> 
>   * adaint.c (__gnat_file_time): New routine.
>   (__gnat_copy_attribs): Copy timestamps in nanoseconds.
>   * libgnat/a-direct.adb (C_Modification_Time): Bind to
>   __gnat_file_time.
>   (Modification_Time): Call to C_Modification_Time.

#if defined(st_mtime)

is a necessary test - but the fields in the stat structure on Darwin platforms 
are
named st_{a,c,m}timespec rather than the Linux st_{a,c,m}tim.

The following patch is a fix lightly tested,
OK for master (if remaining testing is successful) or you have an alternate 
suggestion?

thanks
Iain

diff --git a/gcc/ada/adaint.c b/gcc/ada/adaint.c
index b7406a03c31..ac5738a60d2 100644
--- a/gcc/ada/adaint.c
+++ b/gcc/ada/adaint.c
@@ -1528,8 +1528,12 @@ extern long long __gnat_file_time(char* name)
 #if defined(__GNUG__) && __GNUG__ <= 4
 result = (sb.st_mtime - ada_epoch_offset) * 1E9;
 #if defined(st_mtime)
+#if __APPLE__
+result += sb.st_mtimespec.tv_nsec;
+#else
 result += sb.st_mtim.tv_nsec;
 #endif
+#endif
 #else
   /* Next code similar to
  (sb.st_mtime - ada_epoch_offset) * 1E9 + sb.st_mtim.tv_nsec
@@ -1544,11 +1548,17 @@ extern long long __gnat_file_time(char* name)
   }
 
 #if defined(st_mtime)
+#if __APPLE__
+  if (__builtin_saddll_overflow(result, sb.st_mtimespec.tv_nsec, &result)) {
+return LLONG_MIN;
+  }
+#else
   if (__builtin_saddll_overflow(result, sb.st_mtim.tv_nsec, &result)) {
 return LLONG_MIN;
   }
 #endif
 #endif
+#endif
 #endif
   return result;
 }
@@ -3278,8 +3288,13 @@ __gnat_copy_attribs (char *from ATTRIBUTE_UNUSED, char 
*to ATTRIBUTE_UNUSED,
  tbuf[1].tv_sec  = fbuf.st_mtime;
 
  #if defined(st_mtime)
+ #if __APPLE__
+ tbuf[0].tv_usec = fbuf.st_atimespec.tv_nsec / 1000;
+ tbuf[1].tv_usec = fbuf.st_mtimespec.tv_nsec / 1000;
+ #else
  tbuf[0].tv_usec = fbuf.st_atim.tv_nsec / 1000;
  tbuf[1].tv_usec = fbuf.st_mtim.tv_nsec / 1000;
+ #endif
  #else
  tbuf[0].tv_usec = 0;
  tbuf[1].tv_usec = 0;



Re: [PATCH] x86: Allow configuring with --with-arch_64=x86-64-v[234]

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 5:15 PM Jakub Jelinek  wrote:
>
> On Wed, Sep 30, 2020 at 06:06:31PM +0200, Florian Weimer wrote:
> > --- a/gcc/common/config/i386/i386-common.c
> > +++ b/gcc/common/config/i386/i386-common.c
> > @@ -1795,9 +1795,13 @@ const pta processor_alias_table[] =
> >  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
> >{"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
> >  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
> > -  {"x86-64", PROCESSOR_K8, CPU_K8,
> > -PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR,
> > -0, P_NONE},
> > +  {"x86-64", PROCESSOR_K8, CPU_K8, PTA_X86_64_BASELINE, 0, P_NONE},
> > +  {"x86-64-v2", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V2 | PTA_NO_TUNE,
> > +   0, P_NONE},
> > +  {"x86-64-v3", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V3 | PTA_NO_TUNE,
> > +   0, P_NONE},
> > +  {"x86-64-v4", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V4 | PTA_NO_TUNE,
> > +   0, P_NONE},
> >{"eden-x2", PROCESSOR_K8, CPU_K8,
> >  PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR,
> >  0, P_NONE},
>
> I have noticed that one can't configure gcc to default to these.
>
> I've also found various other 32-bit or 64-bit -march= arguments for which
> it wasn't possible to configure gcc to default to those.
>
> The x86-64-v* the patch only allows in --with-arch_64=, because otherwise
> it fails build miserably - as
> ./xgcc -B ./ -S -march=x86-64-v2 -m32 test.c
> cc1: error: ‘x86-64-v2’ architecture level is only defined for the x86-64 
> psABI
> when building 32-bit multilibs.  Even if multilibs are disallowed, I think
> the compiler still supports -m32 and so --with-arch_64= seems to be the only
> option in which we can support that.
>
> Ok for trunk if this passes bootstrap/regtest?  So far I've just tested that
> e.g. --with-tune=x86-64-v3 build fails as expected and 
> --with-arch_64=x86-64-v3
> one went fine.
>
> 2020-10-21  Jakub Jelinek  
>
> * config.gcc (x86_archs): Add samuel-2, nehemiah, c7 and esther.
> (x86_64_archs): Add eden-x2, nano, nano-1000, nano-2000, nano-3000,
> nano-x2, eden-x4, nano-x4, x86-64-v2, x86-64-v3 and x86-64-v4.
> (i[34567]86-*-* | x86_64-*-*): Only allow x86-64-v* as argument
> to --with-arch_64=.

LGTM.

Thanks,
Uros.

>
> --- gcc/config.gcc.jj   2020-10-15 09:04:50.614521860 +0200
> +++ gcc/config.gcc  2020-10-21 17:03:10.396077993 +0200
> @@ -662,7 +662,8 @@ tm_defines="$tm_defines LIBC_GLIBC=1 LIB
>  x86_archs="athlon athlon-4 athlon-fx athlon-mp athlon-tbird \
>  athlon-xp k6 k6-2 k6-3 geode c3 c3-2 winchip-c6 winchip2 i386 i486 \
>  i586 i686 pentium pentium-m pentium-mmx pentium2 pentium3 pentium3m \
> -pentium4 pentium4m pentiumpro prescott lakemont"
> +pentium4 pentium4m pentiumpro prescott lakemont samuel-2 nehemiah \
> +c7 esther"
>
>  # 64-bit x86 processors supported by --with-arch=.  Each processor
>  # MUST be separated by exactly one space.
> @@ -672,7 +673,8 @@ opteron-sse3 nocona core2 corei7 corei7-
>  slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
>  silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \
>  skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \
> -sapphirerapids alderlake x86-64 native"
> +sapphirerapids alderlake eden-x2 nano nano-1000 nano-2000 nano-3000 \
> +nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 native"
>
>  # Additional x86 processors supported by --with-cpu=.  Each processor
>  # MUST be separated by exactly one space.
> @@ -4458,6 +4460,17 @@ case "${target}" in
> if test x${val} != x; then
> case " $x86_64_archs " in
> *" ${val} "*)
> +   # Disallow x86-64-v* for --with-cpu=/--with-tune=
> +   # or --with-arch= or --with-arch_32=
> +   # It can be only specified in --with-arch_64=
> +   case "x$which$val" in
> +   xcpu*x86-64-v*|xtune*x86-64-v*|xarchx86-64-v*|xarch_32x86-64-v*)
> +   echo "Unknown CPU given in --with-$which=$val." 1>&2
> +   exit 1
> +   ;;
> +   *)
> +   ;;
> +   esac
> # OK
> ;;
> *)
>
> Jakub
>


Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-21 Thread Christophe Lyon via Gcc-patches
On Wed, 21 Oct 2020 at 18:07, Richard Earnshaw
 wrote:
>
> On 21/10/2020 16:49, Christophe Lyon via Gcc-patches wrote:
> > On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
> >  wrote:
> >>
> >> On 20/10/2020 12:22, Richard Earnshaw wrote:
> >>> On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
>  On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
>   wrote:
> >
> > On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
> >> On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
> >>  wrote:
> >>>
> >>> On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
>  On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
>   wrote:
> >
> > On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
> >> When mi_delta is > 255 and -mpure-code is used, we cannot load 
> >> delta
> >> from code memory (like we do without -mpure-code).
> >>
> >> This patch builds the value of mi_delta into r3 with a series of
> >> movs/adds/lsls.
> >>
> >> We also do some cleanup by not emitting the function address and 
> >> delta
> >> via .word directives at the end of the thunk since we don't use 
> >> them
> >> with -mpure-code.
> >>
> >> No need for new testcases, this bug was already identified by
> >> eg. pr46287-3.C
> >>
> >> 2020-09-29  Christophe Lyon  
> >>
> >>   gcc/
> >>   * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in 
> >> r3 and
> >>   do not emit function address and delta when -mpure-code is 
> >> used.
> >
>  Hi Richard,
> 
>  Thanks for your comments.
> 
> > There are some optimizations you can make to this code.
> >
> > Firstly, for values between 256 and 510 (inclusive), it would be 
> > better
> > to just expand a mov of 255 followed by an add.
>  I now see the splitter for the "Pe" constraint which I hadn't noticed
>  before, so I can write something similar indeed.
> 
>  However, I'm not quite sure I understand the benefit of the split
>  when -mpure-code is NOT used.
>  Consider:
>  int f3_1 (void) { return 510; }
>  int f3_2 (void) { return 511; }
>  Compile with -O2 -mcpu=cortex-m0:
>  f3_1:
>  movs    r0, #255
>  lsls    r0, r0, #1
>  bx  lr
>  f3_2:
>  ldr r0, .L4
>  bx  lr
> 
>  The splitter makes the code bigger, does it "compensate" for this by
>  not having to load the constant?
>  Actually the constant uses 4 more bytes, which should be taken into
>  account when comparing code size,
> >>>
> >>> Yes, the size of the literal pool entry needs to be taken into 
> >>> account.
> >>>  It might happen that the entry could be shared with another use of 
> >>> that
> >>> literal, but in general that's rare.
> >>>
>  so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
>  thumb1 instructions would be equivalent in size compared to loading
>  from the literal pool. Should the 256-510 range be extended?
> >>>
> >>> It's a bit borderline at three instructions when literal pools are not
> >>> expensive to use, but in thumb1 literal pools tend to be quite small 
> >>> due
> >>> to the limited pc offsets we can use.  I think on balance we probably
> >>> want to use the instruction sequence unless optimizing for size.
> >>>
> 
> 
> > This is also true for
> > the literal pools alternative as well, so should be handled before 
> > all
> > this.
>  I am not sure what you mean: with -mpure-code, the above sample is 
>  compiled as:
>  f3_1:
>  movs    r0, #255
>  lsls    r0, r0, #1
>  bx  lr
>  f3_2:
>  movs    r0, #1
>  lsls    r0, r0, #8
>  adds    r0, r0, #255
>  bx  lr
> 
>  so the "return 510" case is already handled as without -mpure-code.
> >>>
> >>> I was thinking specifically of the thunk sequence where you seem to be
> >>> emitting instructions directly rather than generating RTL.  The 
> >>> examples
> >>> you show here are not thunks.
> >>>
> >> OK thanks for the clarification.
> >>
> >> Here is an updated version, split into 3 patches to hopefully make
> >> review easier.
> >> They apply on top of my other mpure-code patches for PR96967 and 
> >> PR96770:
> >> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554956.html
> >> 

Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 4:45 PM Qing Zhao  wrote:
>
>
>
> On Oct 21, 2020, at 3:03 AM, Uros Bizjak  wrote:
>
> On Wed, Oct 21, 2020 at 9:18 AM Uros Bizjak  wrote:
>
>
> On Tue, Oct 20, 2020 at 10:04 PM Qing Zhao  wrote:
>
> +/* Check whether the register REGNO should be zeroed on X86.
> +   When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
> +   together, no need to zero it again.
> +   Stack registers (st0-st7) and mm0-mm7 are aliased with each other and
> +   very hard to zero individually, so don't zero individual st or
> +   mm registers at this time.  */
> +
> +static bool
> +zero_call_used_regno_p (const unsigned int regno,
> + bool all_sse_zeroed)
> +{
> +  return GENERAL_REGNO_P (regno)
> +  || (!all_sse_zeroed && SSE_REGNO_P (regno))
> +  || MASK_REGNO_P (regno);
> +}
> +
> +/* Return the machine_mode that is used to zero register REGNO.  */
> +
> +static machine_mode
> +zero_call_used_regno_mode (const unsigned int regno)
> +{
> +  /* NB: We only need to zero the lower 32 bits for integer registers
> + and the lower 128 bits for vector registers since destination are
> + zero-extended to the full register width.  */
> +  if (GENERAL_REGNO_P (regno))
> +return SImode;
> +  else if (SSE_REGNO_P (regno))
> +return V4SFmode;
> +  else
> +return HImode;
> +}
> +
> +/* Generate an rtx to zero all vector registers together if possible,
> +   otherwise, return NULL.  */
> +
> +static rtx
> +zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  if (!TARGET_AVX)
> +return NULL;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +if ((IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)
> +  || (TARGET_64BIT
> +  && (REX_SSE_REGNO_P (regno)
> +  || (TARGET_AVX512F && EXT_REX_SSE_REGNO_P (regno)
> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> +  return NULL;
> +
> +  return gen_avx_vzeroall ();
> +}
> +
> +/* Generate an rtx to zero all st and mm registers together if possible,
> +   otherwise, return NULL.  */
> +
> +static rtx
> +zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  if (!TARGET_MMX)
> +return NULL;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> +  return NULL;
> +
> +  return gen_mmx_emms ();
>
>
> emms is not clearing any register, it only loads x87FPUTagWord with
> H. So I think, the above is useless, as far as register clearing
> is concerned.
>
>
> Thanks for the info.
>
> So, for mm and st registers, should we clear them, and how?
>
>
> I don't know.
>
> Please note that %mm and %st share the same register file, and
> touching %mm registers will block access to %st until emms is emitted.
> You can't just blindly load 0 to %st registers, because the register
> file can be in MMX mode and vice versa. For 32bit targets, function
> can also  return a value in the %mm0.
>
>
> If data flow determine that %mm0 does not return a value at the return, can 
> we clear all the %st as following:
>
> emms
> mov %st0, 0
> mov %st1, 0
> mov %st2, 0
> mov %st3, 0
> mov %st4, 0
> mov %st5, 0
> mov %st6, 0
> mov %st7, 0
>
>
> The i386 ABI says:
>
> -- q --
> The CPU shall be in x87 mode upon entry to a function. Therefore,
> every function that uses the MMX registers is required to issue an
> emms or femms instruction after using MMX registers, before returning
> or calling another function.
> -- /q --
>
> (The above requirement slightly contradicts its own ABI, since we have
> 3 MMX argument registers and MMX return register, so the CPU obviously
> can't be in x87 mode at all function boundaries).
>
> So, assuming that the first sentence is not deliberately vague w.r.t
> function exit, emms should not be needed. However, we are dealing with
> x87 stack registers that have their own set of peculiarities. It is
> not possible to load a random register in the way you show.  Also,
> stack should be either empty or one (two in case of complex value
> return) levels deep at the function return. I think you want a series
> of 8 or 7(6) fldz insns, followed by a series of fstp insns to clear
> the stack and mark stack slots empty.
>
>
> Something like this:
>
> --cut here--
> long double
> __attribute__ ((noinline))
> test (long double a, long double b)
> {
>  long double r = a + b;
>
>  asm volatile ("fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0)" : : "X"(r));
>  return r;
> }
>
> int
> main ()
> {
>  long double a = 1.1, b = 1.2;
>
>  

Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-21 Thread Richard Earnshaw via Gcc-patches
On 21/10/2020 16:49, Christophe Lyon via Gcc-patches wrote:
> On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
>  wrote:
>>
>> On 20/10/2020 12:22, Richard Earnshaw wrote:
>>> On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
 On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
  wrote:
>
> On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
>> On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
>>  wrote:
>>>
>>> On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
 On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
  wrote:
>
> On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
>> When mi_delta is > 255 and -mpure-code is used, we cannot load delta
>> from code memory (like we do without -mpure-code).
>>
>> This patch builds the value of mi_delta into r3 with a series of
>> movs/adds/lsls.
>>
>> We also do some cleanup by not emitting the function address and 
>> delta
>> via .word directives at the end of the thunk since we don't use them
>> with -mpure-code.
>>
>> No need for new testcases, this bug was already identified by
>> eg. pr46287-3.C
>>
>> 2020-09-29  Christophe Lyon  
>>
>>   gcc/
>>   * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in r3 
>> and
>>   do not emit function address and delta when -mpure-code is 
>> used.
>
 Hi Richard,

 Thanks for your comments.

> There are some optimizations you can make to this code.
>
> Firstly, for values between 256 and 510 (inclusive), it would be 
> better
> to just expand a mov of 255 followed by an add.
 I now see the splitter for the "Pe" constraint which I hadn't noticed
 before, so I can write something similar indeed.

 However, I'm not quite sure I understand the benefit of the split
 when -mpure-code is NOT used.
 Consider:
 int f3_1 (void) { return 510; }
 int f3_2 (void) { return 511; }
 Compile with -O2 -mcpu=cortex-m0:
 f3_1:
 movs    r0, #255
 lsls    r0, r0, #1
 bx  lr
 f3_2:
 ldr r0, .L4
 bx  lr

 The splitter makes the code bigger, does it "compensate" for this by
 not having to load the constant?
 Actually the constant uses 4 more bytes, which should be taken into
 account when comparing code size,
>>>
>>> Yes, the size of the literal pool entry needs to be taken into account.
>>>  It might happen that the entry could be shared with another use of that
>>> literal, but in general that's rare.
>>>
 so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
 thumb1 instructions would be equivalent in size compared to loading
 from the literal pool. Should the 256-510 range be extended?
>>>
>>> It's a bit borderline at three instructions when literal pools are not
>>> expensive to use, but in thumb1 literal pools tend to be quite small due
>>> to the limited pc offsets we can use.  I think on balance we probably
>>> want to use the instruction sequence unless optimizing for size.
>>>


> This is also true for
> the literal pools alternative as well, so should be handled before all
> this.
 I am not sure what you mean: with -mpure-code, the above sample is 
 compiled as:
 f3_1:
 movs    r0, #255
 lsls    r0, r0, #1
 bx  lr
 f3_2:
 movs    r0, #1
 lsls    r0, r0, #8
 adds    r0, r0, #255
 bx  lr

 so the "return 510" case is already handled as without -mpure-code.
>>>
>>> I was thinking specifically of the thunk sequence where you seem to be
>>> emitting instructions directly rather than generating RTL.  The examples
>>> you show here are not thunks.
>>>
>> OK thanks for the clarification.
>>
>> Here is an updated version, split into 3 patches to hopefully make
>> review easier.
>> They apply on top of my other mpure-code patches for PR96967 and PR96770:
>> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554956.html
>> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554957.html
>>
>> I kept it this way to make incremental changes easier to understand.
>>
>> Patch 1: With the hope to avoid confusion and make maintenance easier,
>> I have updated thumb1_gen_const_int() so that it can generate either RTL 
>> or
>> asm. This way, all the code used to build thumb-1 constants is in the
>> same place,
>>  in 

Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-21 Thread Christophe Lyon via Gcc-patches
On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
 wrote:
>
> On 20/10/2020 12:22, Richard Earnshaw wrote:
> > On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
> >> On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
> >>  wrote:
> >>>
> >>> On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
>  On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
>   wrote:
> >
> > On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
> >> On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
> >>  wrote:
> >>>
> >>> On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
>  When mi_delta is > 255 and -mpure-code is used, we cannot load delta
>  from code memory (like we do without -mpure-code).
> 
>  This patch builds the value of mi_delta into r3 with a series of
>  movs/adds/lsls.
> 
>  We also do some cleanup by not emitting the function address and 
>  delta
>  via .word directives at the end of the thunk since we don't use them
>  with -mpure-code.
> 
>  No need for new testcases, this bug was already identified by
>  eg. pr46287-3.C
> 
>  2020-09-29  Christophe Lyon  
> 
>    gcc/
>    * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta in r3 
>  and
>    do not emit function address and delta when -mpure-code is 
>  used.
> >>>
> >> Hi Richard,
> >>
> >> Thanks for your comments.
> >>
> >>> There are some optimizations you can make to this code.
> >>>
> >>> Firstly, for values between 256 and 510 (inclusive), it would be 
> >>> better
> >>> to just expand a mov of 255 followed by an add.
> >> I now see the splitter for the "Pe" constraint which I hadn't noticed
> >> before, so I can write something similar indeed.
> >>
> >> However, I'm not quite sure I understand the benefit of the split
> >> when -mpure-code is NOT used.
> >> Consider:
> >> int f3_1 (void) { return 510; }
> >> int f3_2 (void) { return 511; }
> >> Compile with -O2 -mcpu=cortex-m0:
> >> f3_1:
> >> movs    r0, #255
> >> lsls    r0, r0, #1
> >> bx  lr
> >> f3_2:
> >> ldr r0, .L4
> >> bx  lr
> >>
> >> The splitter makes the code bigger, does it "compensate" for this by
> >> not having to load the constant?
> >> Actually the constant uses 4 more bytes, which should be taken into
> >> account when comparing code size,
> >
> > Yes, the size of the literal pool entry needs to be taken into account.
> >  It might happen that the entry could be shared with another use of that
> > literal, but in general that's rare.
> >
> >> so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
> >> thumb1 instructions would be equivalent in size compared to loading
> >> from the literal pool. Should the 256-510 range be extended?
> >
> > It's a bit borderline at three instructions when literal pools are not
> > expensive to use, but in thumb1 literal pools tend to be quite small due
> > to the limited pc offsets we can use.  I think on balance we probably
> > want to use the instruction sequence unless optimizing for size.
> >
> >>
> >>
> >>> This is also true for
> >>> the literal pools alternative as well, so should be handled before all
> >>> this.
> >> I am not sure what you mean: with -mpure-code, the above sample is 
> >> compiled as:
> >> f3_1:
> >> movs    r0, #255
> >> lsls    r0, r0, #1
> >> bx  lr
> >> f3_2:
> >> movs    r0, #1
> >> lsls    r0, r0, #8
> >> adds    r0, r0, #255
> >> bx  lr
> >>
> >> so the "return 510" case is already handled as without -mpure-code.
> >
> > I was thinking specifically of the thunk sequence where you seem to be
> > emitting instructions directly rather than generating RTL.  The examples
> > you show here are not thunks.
> >
>  OK thanks for the clarification.
> 
>  Here is an updated version, split into 3 patches to hopefully make
>  review easier.
>  They apply on top of my other mpure-code patches for PR96967 and PR96770:
>  https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554956.html
>  https://gcc.gnu.org/pipermail/gcc-patches/2020-September/554957.html
> 
>  I kept it this way to make incremental changes easier to understand.
> 
>  Patch 1: With the hope to avoid confusion and make maintenance easier,
>  I have updated thumb1_gen_const_int() so that it can generate either RTL 
>  or
>  asm. This way, all the code used to build thumb-1 constants is in the
>  same place,
>   in case we need to improve/fix it later. We now generate shorter 
> 

Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Richard Sandiford via Gcc-patches
Qing Zhao  writes:
>>> +  /* For each of the hard registers, check to see whether we should zero 
>>> it if:
>>> + 1. it is a call-used register;
>>> + and 2. it is not a fixed register;
>>> + and 3. it is not live at the return of the routine;
>>> + and 4. it is a general register if gpr_only is true;
>>> + and 5. it is used in the routine if used_only is true;
>>> + and 6. it is a register that passes a parameter if arg_only is true;
>>> +   */
>>> +
>>> +  HARD_REG_SET need_zeroed_hardregs;
>>> +  CLEAR_HARD_REG_SET (need_zeroed_hardregs);
>>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>> +{
>>> +  if (!this_target_hard_regs->x_call_used_regs[regno])
>>> +   continue;
>> 
>> This should use crtl->abi instead.  The set of call-used registers
>> can vary from function to function.
>
> You mean to use:
>
> If (!crtl->abi->clobbers_full_reg_p(regno))
>
> ?

Yeah, that's right.  (But with a space before “(regno)” :-))

>>> +static unsigned int
>>> +rest_of_zero_call_used_regs (void)
>>> +{
>>> +  basic_block bb;
>>> +  rtx_insn *insn;
>>> +
>>> +  /* This pass needs data flow information.  */
>>> +  df_analyze ();
>>> +
>>> +  /* Search all the "return"s in the routine, and insert instruction 
>>> sequence to
>>> + zero the call used registers.  */
>>> +  FOR_EACH_BB_REVERSE_FN (bb, cfun)
>>> +if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)
>>> +   || (single_succ_p (bb)
>>> +   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)))
>>> +  FOR_BB_INSNS_REVERSE (bb, insn)
>>> +   if (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
>>> + {
>>> +   /* Now we can insert the instruction sequence to zero the call used
>>> +  registers before this insn.  */
>>> +   gen_call_used_regs_seq (insn);
>>> +   break;
>>> + }
>> 
>> The exit block never has instructions, so it's only necessary to process
>> predecessors.  A simpler way to do that is to iterate over the edges in:
>> 
>>  EXIT_BLOCK_PTR_FOR_FN (cfun)->preds
>> 
>> You shouldn't need to use FOR_BB_INSNS_REVERSE: it should be enough
>> to check only BB_END (bb), since returns always end blocks.
>
> Something like the following?
>
>   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> {
>  insn = BB_END (e->src);
>   If (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
> {
> /* Now we can insert the instruction sequence to zero the call used
>  registers before this insn.  */
>   gen_call_used_regs_seq (insn);
>   break;   
> }
>   }

With this you don't want/need the break, since it would break out
of the edge traversal (instead of the FOR_BB_INSNS_REVERSE, as above).
Also, I think the code becomes simple enough that the comment isn't
really needed.
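
i.e., something like this (your loop from above, minus the break, with the
declarations filled in; gen_call_used_regs_seq as in your patch):

edge e;
edge_iterator ei;
FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
  {
    rtx_insn *insn = BB_END (e->src);
    if (JUMP_P (insn) && ANY_RETURN_P (JUMP_LABEL (insn)))
      gen_call_used_regs_seq (insn);
  }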

Thanks,
Richard


Re: [PATCH] [PR rtl-optimization/97249]Simplify vec_select of paradoxical subreg.

2020-10-21 Thread Richard Sandiford via Gcc-patches
Hongtao Liu  writes:
> +   poly_uint64 nunits
> + = GET_MODE_NUNITS (GET_MODE (SUBREG_REG (trueop0)));
> +   rtx par = trueop1;
> +   for (int i = 0; i != l1; i++)
> + {
> +   rtx idx = XVECEXP (trueop1, 0, i);
> +   if (!CONST_INT_P (idx)
> +   || maybe_ge (UINTVAL (idx) + subreg_offset, nunits))
> + return 0;
> + }

I think the previous version was better.  We shouldn't assume that
further simplification rules will fail just because the conditions
for this rule haven't been met.

Thanks,
Richard


Re: [PATCH] x86: Allow configuring with --with-arch_64=x86-64-v[234]

2020-10-21 Thread Florian Weimer
* Jakub Jelinek:

> On Wed, Sep 30, 2020 at 06:06:31PM +0200, Florian Weimer wrote:
>> --- a/gcc/common/config/i386/i386-common.c
>> +++ b/gcc/common/config/i386/i386-common.c
>> @@ -1795,9 +1795,13 @@ const pta processor_alias_table[] =
>>  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
>>{"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
>>  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
>> -  {"x86-64", PROCESSOR_K8, CPU_K8,
>> -PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR,
>> -0, P_NONE},
>> +  {"x86-64", PROCESSOR_K8, CPU_K8, PTA_X86_64_BASELINE, 0, P_NONE},
>> +  {"x86-64-v2", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V2 | PTA_NO_TUNE,
>> +   0, P_NONE},
>> +  {"x86-64-v3", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V3 | PTA_NO_TUNE,
>> +   0, P_NONE},
>> +  {"x86-64-v4", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V4 | PTA_NO_TUNE,
>> +   0, P_NONE},
>>{"eden-x2", PROCESSOR_K8, CPU_K8,
>>  PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR,
>>  0, P_NONE},
>
> I have noticed that one can't configure gcc to default to these.
>
> I've also found various other 32-bit or 64-bit -march= arguments for which
> it wasn't possible to configure gcc to default to those.
>
> The patch allows the x86-64-v* names only in --with-arch_64=, because
> otherwise the build fails miserably - as
> ./xgcc -B ./ -S -march=x86-64-v2 -m32 test.c
> cc1: error: ‘x86-64-v2’ architecture level is only defined for the x86-64 psABI
> when building 32-bit multilibs.  Even if multilibs are disallowed, I think
> the compiler still supports -m32, and so --with-arch_64= seems to be the only
> option in which we can support these.

Yes, that was certainly my intent.


[PATCH] x86: Allow configuring with --with-arch_64=x86-64-v[234]

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Sep 30, 2020 at 06:06:31PM +0200, Florian Weimer wrote:
> --- a/gcc/common/config/i386/i386-common.c
> +++ b/gcc/common/config/i386/i386-common.c
> @@ -1795,9 +1795,13 @@ const pta processor_alias_table[] =
>  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
>{"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
>  PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR, 0, P_NONE},
> -  {"x86-64", PROCESSOR_K8, CPU_K8,
> -PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR,
> -0, P_NONE},
> +  {"x86-64", PROCESSOR_K8, CPU_K8, PTA_X86_64_BASELINE, 0, P_NONE},
> +  {"x86-64-v2", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V2 | PTA_NO_TUNE,
> +   0, P_NONE},
> +  {"x86-64-v3", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V3 | PTA_NO_TUNE,
> +   0, P_NONE},
> +  {"x86-64-v4", PROCESSOR_K8, CPU_GENERIC, PTA_X86_64_V4 | PTA_NO_TUNE,
> +   0, P_NONE},
>{"eden-x2", PROCESSOR_K8, CPU_K8,
>  PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR,
>  0, P_NONE},

I have noticed that one can't configure gcc to default to these.

I've also found various other 32-bit or 64-bit -march= arguments for which
it wasn't possible to configure gcc to default to those.

The patch allows the x86-64-v* names only in --with-arch_64=, because
otherwise the build fails miserably - as
./xgcc -B ./ -S -march=x86-64-v2 -m32 test.c
cc1: error: ‘x86-64-v2’ architecture level is only defined for the x86-64 psABI
when building 32-bit multilibs.  Even if multilibs are disallowed, I think
the compiler still supports -m32, and so --with-arch_64= seems to be the only
option in which we can support these.

Ok for trunk if this passes bootstrap/regtest?  So far I've just tested that
e.g. --with-tune=x86-64-v3 build fails as expected and --with-arch_64=x86-64-v3
one went fine.

2020-10-21  Jakub Jelinek  

* config.gcc (x86_archs): Add samuel-2, nehemiah, c7 and esther.
(x86_64_archs): Add eden-x2, nano, nano-1000, nano-2000, nano-3000,
nano-x2, eden-x4, nano-x4, x86-64-v2, x86-64-v3 and x86-64-v4.
(i[34567]86-*-* | x86_64-*-*): Only allow x86-64-v* as argument
to --with-arch_64=.

--- gcc/config.gcc.jj   2020-10-15 09:04:50.614521860 +0200
+++ gcc/config.gcc  2020-10-21 17:03:10.396077993 +0200
@@ -662,7 +662,8 @@ tm_defines="$tm_defines LIBC_GLIBC=1 LIB
 x86_archs="athlon athlon-4 athlon-fx athlon-mp athlon-tbird \
 athlon-xp k6 k6-2 k6-3 geode c3 c3-2 winchip-c6 winchip2 i386 i486 \
 i586 i686 pentium pentium-m pentium-mmx pentium2 pentium3 pentium3m \
-pentium4 pentium4m pentiumpro prescott lakemont"
+pentium4 pentium4m pentiumpro prescott lakemont samuel-2 nehemiah \
+c7 esther"
 
 # 64-bit x86 processors supported by --with-arch=.  Each processor
 # MUST be separated by exactly one space.
@@ -672,7 +673,8 @@ opteron-sse3 nocona core2 corei7 corei7-
 slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
 silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \
 skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \
-sapphirerapids alderlake x86-64 native"
+sapphirerapids alderlake eden-x2 nano nano-1000 nano-2000 nano-3000 \
+nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 native"
 
 # Additional x86 processors supported by --with-cpu=.  Each processor
 # MUST be separated by exactly one space.
@@ -4458,6 +4460,17 @@ case "${target}" in
if test x${val} != x; then
case " $x86_64_archs " in
*" ${val} "*)
+					# Disallow x86-64-v* for --with-cpu=/--with-tune=
+					# or --with-arch= or --with-arch_32=
+					# It can be only specified in --with-arch_64=
+					case "x$which$val" in
+					xcpu*x86-64-v*|xtune*x86-64-v*|xarchx86-64-v*|xarch_32x86-64-v*)
+						echo "Unknown CPU given in --with-$which=$val." 1>&2
+						exit 1
+						;;
+					*)
+						;;
+					esac
# OK
;;
*)

Jakub



Increase inlining limits for inline functions with builtin_constant_p on parameter

2020-10-21 Thread Jan Hubicka
Hi,
this patch implements a heuristic that increases inline limits (via the hints
mechanism) for inline functions that use builtin_constant_p on a parameter.
Those are very likely intended to be always inlined and to simplify after
inlining.

The PR is about a function that we used to inline with
--param inline-insns-single=200, but with the new default of 70 for -O2 we no
longer do so.  Hints are currently configured to bump the bound up twice, so
we get a limit of 140; that is still not enough to inline the particular
testcase, but it should help in general.  I can implement a stronger bump if
that seems useful (maybe it is).  The example is a bit operation written as a
decision chain with 64 conditions:

return ( __builtin_constant_p((size) - 1) ? ( __builtin_constant_p((size) - 1)
? ( ((size) - 1) < 2 ? 0 : ((size) - 1) & (1ULL << 63) ? 63 : ((size) - 1) &
(1ULL << 62) ? 62 : ((size) - 1) & (1ULL << 61) ? 61 : ((size) - 1) & (1ULL <<
60) ? 60 : ((size) - 1) & (1ULL << 59) ? 59 : ((size) - 1) & (1ULL << 58) ? 58
: ((size) - 1) & (1ULL << 57) ? 57 : ((size) - 1) & (1ULL << 56) ? 56 : ((size)
- 1) & (1ULL << 55) ? 55 : ((size) - 1) & (1ULL << 54) ? 54 : ((size) - 1) &
(1ULL << 53) ? 53 : ((size) - 1) & (1ULL << 52) ? 52 : ((size) - 1) & (1ULL <<
51) ? 51 : ((size) - 1) & (1ULL << 50) ? 50 : ((size) - 1) & (1ULL << 49) ? 49
: ((size) - 1) & (1ULL << 48) ? 48 : ((size) - 1) & (1ULL << 47) ? 47 : ((size)
- 1) & (1ULL << 46) ? 46 : ((size) - 1) & (1ULL << 45) ? 45 : ((size) - 1) &
(1ULL << 44) ? 44 : ((size) - 1) & (1ULL << 43) ? 43 : ((size) - 1) & (1ULL <<
42) ? 42 : ((size) - 1) & (1ULL << 41) ? 41 : ((size) - 1) & (1ULL << 40) ? 40
: ((size) - 1) & (1ULL << 39) ? 39 : ((size) - 1) & (1ULL << 38) ? 38 : ((size)
- 1) & (1ULL << 37) ? 37 : ((size) - 1) & (1ULL << 36) ? 36 : ((size) - 1) &
(1ULL << 35) ? 35 : ((size) - 1) & (1ULL << 34) ? 34 : ((size) - 1) & (1ULL <<
33) ? 33 : ((size) - 1) & (1ULL << 32) ? 32 : ((size) - 1) & (1ULL << 31) ? 31
: ((size) - 1) & (1ULL << 30) ? 30 : ((size) - 1) & (1ULL << 29) ? 29 : ((size)
- 1) & (1ULL << 28) ? 28 : ((size) - 1) & (1ULL << 27) ? 27 : ((size) - 1) &
(1ULL << 26) ? 26 : ((size) - 1) & (1ULL << 25) ? 25 : ((size) - 1) & (1ULL <<
24) ? 24 : ((size) - 1) & (1ULL << 23) ? 23 : ((size) - 1) & (1ULL << 22) ? 22
: ((size) - 1) & (1ULL << 21) ? 21 : ((size) - 1) & (1ULL << 20) ? 20 : ((size)
- 1) & (1ULL << 19) ? 19 : ((size) - 1) & (1ULL << 18) ? 18 : ((size) - 1) &
(1ULL << 17) ? 17 : ((size) - 1) & (1ULL << 16) ? 16 : ((size) - 1) & (1ULL <<
15) ? 15 : ((size) - 1) & (1ULL << 14) ? 14 : ((size) - 1) & (1ULL << 13) ? 13
: ((size) - 1) & (1ULL << 12) ? 12 : ((size) - 1) & (1ULL << 11) ? 11 : ((size)
- 1) & (1ULL << 10) ? 10 : ((size) - 1) & (1ULL << 9) ? 9 : ((size) - 1) &
(1ULL << 8) ? 8 : ((size) - 1) & (1ULL << 7) ? 7 : ((size) - 1) & (1ULL << 6) ?
6 : ((size) - 1) & (1ULL << 5) ? 5 : ((size) - 1) & (1ULL << 4) ? 4 : ((size) -
1) & (1ULL << 3) ? 3 : ((size) - 1) & (1ULL << 2) ? 2 : 1) : -1) :
(sizeof((size) - 1) <= 4) ? __ilog2_u32((size) - 1) : __ilog2_u64((size) - 1) )
- 12 + 1;

This blows up the limit on the number of conditions we track per function
(which is 30) and thus the size/time estimates are not working that well.

Bootstrapped/regtested on x86_64-linux, will commit it after a bit more testing.
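
A reduced illustration of the pattern the hint targets (hand-written here,
not the testcase from the PR): when inlined with a constant argument, the
__builtin_constant_p arm folds away completely.

  static inline int
  my_ilog2 (unsigned long n)
  {
    if (__builtin_constant_p (n))
      /* Decision chain, truncated to 4 bits for brevity.  */
      return n & (1UL << 3) ? 3
	     : n & (1UL << 2) ? 2
	     : n & (1UL << 1) ? 1 : 0;
    return (int) (8 * sizeof (long) - 1) - __builtin_clzl (n);
  }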

gcc/ChangeLog:

2020-10-21  Jan Hubicka  

PR ipa/97445
* ipa-fnsummary.c (ipa_dump_hints): Add INLINE_HINT_builtin_constant_p.
(ipa_fn_summary::~ipa_fn_summary): Free builtin_constant_p_parms.
(ipa_fn_summary_t::duplicate): Duplicate builtin_constant_p_parms.
(ipa_dump_fn_summary): Dump builtin_constant_p_parms.
(add_builtin_constant_p_parm): New function.
(set_cond_stmt_execution_predicate): Update builtin_constant_p_parms.
(ipa_call_context::estimate_size_and_time): Set
INLINE_HINT_builtin_constant_p.
(ipa_merge_fn_summary_after_inlining): Merge builtin_constant_p_parms.
(inline_read_section): Read builtin_constant_p_parms.
(ipa_fn_summary_write): Write builtin_constant_p_parms.
* ipa-fnsummary.h (enum ipa_hints_vals): Add
INLINE_HINT_builtin_constant_p.
* ipa-inline.c (want_inline_small_function_p): Use
INLINE_HINT_builtin_constant_p.
(edge_badness): Use INLINE_HINT_builtin_constant_p.

gcc/testsuite/ChangeLog:

2020-10-21  Jan Hubicka  

PR ipa/97445
* gcc.dg/ipa/inlinehint-5.c: New test.

diff --git a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c
index 9e3eda4d3cb..eb7467a8d52 100644
--- a/gcc/ipa-fnsummary.c
+++ b/gcc/ipa-fnsummary.c
@@ -141,6 +141,11 @@ ipa_dump_hints (FILE *f, ipa_hints hints)
   hints &= ~INLINE_HINT_known_hot;
   fprintf (f, " known_hot");
 }
+  if (hints & INLINE_HINT_builtin_constant_p)
+{
+  hints &= ~INLINE_HINT_builtin_constant_p;
+  fprintf (f, " builtin_constant_p");
+}
   gcc_assert (!hints);
 }
 
@@ -751,6 +756,7 @@ ipa_fn_summary::~ipa_fn_summary ()
   vec_free 

[patch] vxworks: Remove interfering default #undefs from vx-common.h

2020-10-21 Thread Olivier Hainque

This patch removes the #undef issued for LIB_SPEC and LINK_SPEC
in vx-common.h, which all the ports do on their own and which
impairs the bi-arch’d ppc*-vx7r2 targets, relying on linux64
definitions.

Tested together with the previous changes posted for the
newly introduced powerpc ports.

Committing to mainline shortly.

Olivier

2020-10-21  Douglas Rupp  

gcc/
* config/vx-common.h (LINK_SPEC, LIB_SPEC): Remove #undef.


--- a/gcc/config/vx-common.h
+++ b/gcc/config/vx-common.h
@@ -23,8 +23,6 @@ along with GCC; see the file COPYING3.  If not see
 /* Most of these will probably be overridden by subsequent headers.  We
undefine them here just in case, and define VXWORKS_ versions of each,
to be used in port-specific vxworks.h.  */
-#undef LIB_SPEC
-#undef LINK_SPEC
 #undef LIBGCC_SPEC
 #define LIBGCC_SPEC VXWORKS_LIBGCC_SPEC
 #undef STARTFILE_SPEC


Re: [PATCH 1/2] [target 87767] Refactor AVX512 broadcast patterns with special memory constraint.

2020-10-21 Thread Vladimir Makarov via Gcc-patches



On 2020-10-20 10:11 p.m., Hongtao Liu wrote:


Changed, and it passed the i386/x86-64 regression test.

Update patch.


Thank you, Hongtao.  This patch is ok for the trunk.




[PATCH] SLP: Move load/store-lanes check till late

2020-10-21 Thread Tamar Christina via Gcc-patches
Hi All,

This moves the code that checks for load/store lanes further in the pipeline and
places it after slp_optimize.  This would allow us to perform optimizations on
the SLP tree and only bail out if we really have a permute.

With this change we can handle permutes such as {1,1,1,1}, which should be
handled by a load and replicate.
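
For instance, a kernel along these lines (a hand-reduced example, not one of
the adjusted testcases) produces such a splat-style load permutation:

  void
  f (int *restrict a, int *restrict b, int x)
  {
    for (int i = 0; i < 1024; i += 2)
      {
	/* Both SLP lanes load b[0], giving load permutation {0,0}:
	   a splat that a load-and-replicate handles directly.  */
	a[i + 0] = b[0] + x;
	a[i + 1] = b[0] - x;
      }
  }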

This change, however, makes it all or nothing: either all instances can be
handled, or none at all.  This is why some of the test cases have been adjusted.

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-slp.c (vect_analyze_slp_instance): Move load/store-lanes
check to ...
* tree-vect-loop.c (vect_analyze_loop_2): ... here.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/slp-11b.c: Update output scan.
* gcc.dg/vect/slp-perm-6.c: Likewise.

-- 
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11b.c b/gcc/testsuite/gcc.dg/vect/slp-11b.c
index 0cc23770badf0e00ef98769a2dd14a92dca32cca..fe5bb0c3ce7682c7cef1313e342d95aba3fe11b2 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11b.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11b.c
@@ -45,4 +45,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided4 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided4 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "re-trying with SLP disabled" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
index 38489291a2659c989121d44c9e9e7bdfaa12f868..07bf8916de7ce88bbb1d65437f8bf6d8ab17efe6 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
@@ -106,7 +106,7 @@ int main (int argc, const char* argv[])
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int && { {! vect_load_lanes } && {! vect_partial_vectors_usage_1 } } } } } } */
 /* The epilogues are vectorized using partial vectors.  */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { vect_perm3_int && { {! vect_load_lanes } && vect_partial_vectors_usage_1 } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 991fd4572298448c5d074f87a4ed318f0a3c9db6..c1350a8008850ea5e21a27cacd7e340d0da9bc9c 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2342,6 +2342,60 @@ start_over:
    "unsupported SLP instances\n");
 	  goto again;
 	}
+
+  /* Check whether any load is possibly permuted.  */
+  slp_tree load_node, slp_root;
+  unsigned i, x;
+  slp_instance instance;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
+	{
+	  bool loads_permuted = false;
+	  slp_root = SLP_INSTANCE_TREE (instance);
+	  int group_size = SLP_TREE_LANES (slp_root);
+	  tree vectype = SLP_TREE_VECTYPE (slp_root);
+
+	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+	{
+	  if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+		continue;
+	  unsigned j;
+	  stmt_vec_info load_info;
+	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+		if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+		  {
+		loads_permuted = true;
+		break;
+		  }
+	}
+
+	  /* If the loads and stores can be handled with load/store-lane
+	 instructions do not generate this SLP instance.  */
+	  if (loads_permuted
+	  && vect_store_lanes_supported (vectype, group_size, false))
+	{
+	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+		{
+		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+		  (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+		  /* Use SLP for strided accesses (or if we can't
+		 load-lanes).  */
+		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+		  || ! vect_load_lanes_supported
+			(STMT_VINFO_VECTYPE (stmt_vinfo),
+			 DR_GROUP_SIZE (stmt_vinfo), false))
+		break;
+		}
+
+	  if (i == SLP_INSTANCE_LOADS (instance).length ())
+		{
+		ok = opt_result::failure_at (vect_location,
+		 "Built SLP cancelled: can use"
+		 " load/store-lanes\n");
+		  goto again;
+		}
+	}
+	}
+
 }
 
   /* Dissolve SLP-only groups.  */
diff --git 

Re: [PATCH][middle-end][i386][version 3]Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-21 Thread Qing Zhao via Gcc-patches



> On Oct 21, 2020, at 3:03 AM, Uros Bizjak  wrote:
> 
> On Wed, Oct 21, 2020 at 9:18 AM Uros Bizjak  > wrote:
>> 
>> On Tue, Oct 20, 2020 at 10:04 PM Qing Zhao  wrote:
>> 
>>> +/* Check whether the register REGNO should be zeroed on X86.
>>> +   When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
>>> +   together, no need to zero it again.
>>> +   Stack registers (st0-st7) and mm0-mm7 are aliased with each other and
>>> +   very hard to zero individually; don't zero individual st or
>>> +   mm registers at this time.  */
>>> +
>>> +static bool
>>> +zero_call_used_regno_p (const unsigned int regno,
>>> + bool all_sse_zeroed)
>>> +{
>>> +  return GENERAL_REGNO_P (regno)
>>> +  || (!all_sse_zeroed && SSE_REGNO_P (regno))
>>> +  || MASK_REGNO_P (regno);
>>> +}
>>> +
>>> +/* Return the machine_mode that is used to zero register REGNO.  */
>>> +
>>> +static machine_mode
>>> +zero_call_used_regno_mode (const unsigned int regno)
>>> +{
>>> +  /* NB: We only need to zero the lower 32 bits for integer registers
>>> + and the lower 128 bits for vector registers since destination are
>>> + zero-extended to the full register width.  */
>>> +  if (GENERAL_REGNO_P (regno))
>>> +return SImode;
>>> +  else if (SSE_REGNO_P (regno))
>>> +return V4SFmode;
>>> +  else
>>> +return HImode;
>>> +}
>>> +
>>> +/* Generate an rtx to zero all vector registers together if possible,
>>> +   otherwise, return NULL.  */
>>> +
>>> +static rtx
>>> +zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
>>> +{
>>> +  if (!TARGET_AVX)
>>> +return NULL;
>>> +
>>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>> +if ((IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)
>>> +  || (TARGET_64BIT
>>> +  && (REX_SSE_REGNO_P (regno)
>>> +  || (TARGET_AVX512F && EXT_REX_SSE_REGNO_P (regno)
>>> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>> +  return NULL;
>>> +
>>> +  return gen_avx_vzeroall ();
>>> +}
>>> +
>>> +/* Generate an rtx to zero all st and mm registers together if possible,
>>> +   otherwise, return NULL.  */
>>> +
>>> +static rtx
>>> +zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
>>> +{
>>> +  if (!TARGET_MMX)
>>> +return NULL;
>>> +
>>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>> +if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
>>> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>> +  return NULL;
>>> +
>>> +  return gen_mmx_emms ();
>>> 
>>> 
>>> emms is not clearing any register, it only loads x87FPUTagWord with
>>> FFFFH. So I think the above is useless, as far as register clearing
>>> is concerned.
>>> 
>>> 
>>> Thanks for the info.
>>> 
>>> So, for mm and st registers, should we clear them, and how?
>>> 
>>> 
>>> I don't know.
>>> 
>>> Please note that %mm and %st share the same register file, and
>>> touching %mm registers will block access to %st until emms is emitted.
>>> You can't just blindly load 0 to %st registers, because the register
>>> file can be in MMX mode and vice versa. For 32bit targets, function
>>> can also  return a value in the %mm0.
>>> 
>>> 
>>> If data flow determines that %mm0 does not return a value at the return,
>>> can we clear all the %st as follows:
>>> 
>>> emms
>>> mov %st0, 0
>>> mov %st1, 0
>>> mov %st2, 0
>>> mov %st3, 0
>>> mov %st4, 0
>>> mov %st5, 0
>>> mov %st6, 0
>>> mov %st7, 0
>> 
>> The i386 ABI says:
>> 
>> -- q --
>> The CPU shall be in x87 mode upon entry to a function. Therefore,
>> every function that uses the MMX registers is required to issue an
>> emms or femms instruction after using MMX registers, before returning
>> or calling another function.
>> -- /q --
>> 
>> (The above requirement slightly contradicts its own ABI, since we have
>> 3 MMX argument registers and an MMX return register, so the CPU obviously
>> can't be in x87 mode at all function boundaries).
>> 
>> So, assuming that the first sentence is not deliberately vague w.r.t
>> function exit, emms should not be needed. However, we are dealing with
>> x87 stack registers that have their own set of peculiarities. It is
>> not possible to load a random register in the way you show.  Also,
>> the stack should be either empty or one (two in case of a complex value
>> return) levels deep at function return.  I think you want a series
>> of 8 or 7 (6) fldz insns, followed by a series of fstp insns to clear
>> the stack and mark the stack slots empty.
> 
> Something like this:
> 
> --cut here--
> long double
> __attribute__ ((noinline))
> test (long double a, long double b)
> {
>  long double r = a + b;
> 
>  asm volatile ("fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fldz;\
>fstp %%st(0);\
>fstp %%st(0);\
>fstp %%st(0);\
> 

[committed] libstdc++: Make structured bindings always work for subranges [PR 97512]

2020-10-21 Thread Jonathan Wakely via Gcc-patches
The definition of ranges::subrange was moved to the new
<bits/ranges_util.h> header so that it could be used in <algorithm>
without including the whole of <ranges>. However, the tuple-like support
that enables subrange to be used with structured bindings was left in
<ranges>. This is arguably conforming (to use a subrange you should
include <ranges>) but it's inconvenient and probably confusing.

This change makes the tuple-like support available whenever subrange
itself is available.
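
A minimal illustration (hand-written, not the committed test):

  // With the change, <algorithm> alone provides the tuple protocol
  // for subrange; previously this needed <ranges> as well.
  #include <algorithm>

  int
  test ()
  {
    int r[] = { 1, 2, 2, 3 };
    auto sub = std::ranges::subrange (r, r + 4);
    auto [first, last] = sub;   // uses tuple_size/tuple_element
    return last - first;        // 4
  }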

libstdc++-v3/ChangeLog:

PR libstdc++/97512
* include/bits/ranges_util.h (tuple_size)
(tuple_element): Move here from ...
* include/std/ranges: ... here.
* testsuite/std/ranges/subrange/97512.cc: New test.

Tested powerpc64le-linux. Committed to trunk.

commit a186ab670e97c4c3883d96506655c4621e7c5515
Author: Jonathan Wakely 
Date:   Wed Oct 21 14:40:54 2020

libstdc++: Make structured bindings always work for subranges [PR 97512]

The definition of ranges::subrange was moved to the new
<bits/ranges_util.h> header so that it could be used in <algorithm>
without including the whole of <ranges>. However, the tuple-like support
that enables subrange to be used with structured bindings was left in
<ranges>. This is arguably conforming (to use a subrange you should
include <ranges>) but it's inconvenient and probably confusing.

This change makes the tuple-like support available whenever subrange
itself is available.

libstdc++-v3/ChangeLog:

PR libstdc++/97512
* include/bits/ranges_util.h (tuple_size)
(tuple_element): Move here from ...
* include/std/ranges: ... here.
* testsuite/std/ranges/subrange/97512.cc: New test.

diff --git a/libstdc++-v3/include/bits/ranges_util.h b/libstdc++-v3/include/bits/ranges_util.h
index a98658ff5c8..cc50e2ad4e4 100644
--- a/libstdc++-v3/include/bits/ranges_util.h
+++ b/libstdc++-v3/include/bits/ranges_util.h
@@ -410,6 +410,27 @@ namespace ranges
 
   using ranges::get;
 
+  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
+    struct tuple_size<ranges::subrange<_Iter, _Sent, _Kind>>
+    : integral_constant<size_t, 2>
+    { };
+
+  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
+    struct tuple_element<0, ranges::subrange<_Iter, _Sent, _Kind>>
+    { using type = _Iter; };
+
+  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
+    struct tuple_element<1, ranges::subrange<_Iter, _Sent, _Kind>>
+    { using type = _Sent; };
+
+  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
+    struct tuple_element<0, const ranges::subrange<_Iter, _Sent, _Kind>>
+    { using type = _Iter; };
+
+  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
+    struct tuple_element<1, const ranges::subrange<_Iter, _Sent, _Kind>>
+    { using type = _Sent; };
+
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
 #endif // library concepts
diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
index b03ae399fa6..4d3a4940c6f 100644
--- a/libstdc++-v3/include/std/ranges
+++ b/libstdc++-v3/include/std/ranges
@@ -3320,27 +3320,6 @@ namespace views
 
   namespace views = ranges::views;
 
-  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
-    struct tuple_size<ranges::subrange<_Iter, _Sent, _Kind>>
-    : integral_constant<size_t, 2>
-    { };
-
-  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
-    struct tuple_element<0, ranges::subrange<_Iter, _Sent, _Kind>>
-    { using type = _Iter; };
-
-  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
-    struct tuple_element<1, ranges::subrange<_Iter, _Sent, _Kind>>
-    { using type = _Sent; };
-
-  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
-    struct tuple_element<0, const ranges::subrange<_Iter, _Sent, _Kind>>
-    { using type = _Iter; };
-
-  template<typename _Iter, typename _Sent, ranges::subrange_kind _Kind>
-    struct tuple_element<1, const ranges::subrange<_Iter, _Sent, _Kind>>
-    { using type = _Sent; };
-
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
 #endif // library concepts
diff --git a/libstdc++-v3/testsuite/std/ranges/subrange/97512.cc b/libstdc++-v3/testsuite/std/ranges/subrange/97512.cc
new file mode 100644
index 000..b0fd0e2577f
--- /dev/null
+++ b/libstdc++-v3/testsuite/std/ranges/subrange/97512.cc
@@ -0,0 +1,33 @@
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-options "-std=gnu++2a" }
+// { dg-do compile { target c++2a } }
+
+// PR libstdc++/97512
+// Check that structured bindings work for subranges without <ranges>.
+#include <algorithm>
+
+constexpr bool
+test01()
+{
+  int r[] = { 1, 2, 2, 3, 3, 3 };
+  auto [first, last] = std::ranges::unique(r);
+  return first == std::ranges::begin(r) + 3 && last == std::ranges::end(r);
+}
+
+static_assert( test01() );


[Patch, committed] Fortran: class.c - update vtable comment

2020-10-21 Thread Tobias Burnus

I was confused as I saw _deallocate – but it didn't show up
in the big comment at the beginning of class.c.
Hence, I added it.

Committed as r11-4186-g310fe80babe04ccb7d2e15c8fca7dc98180701a8
but if you have follow-up suggestions, we can surely change it.

Tobias

-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
commit 310fe80babe04ccb7d2e15c8fca7dc98180701a8
Author: Tobias Burnus 
Date:   Wed Oct 21 14:38:44 2020 +0200

Fortran: class.c - update vtable comment

gcc/fortran/
PR fortran/45516
* class.c: Add _deallocate to the vtable documentation
comment.

diff --git a/gcc/fortran/class.c b/gcc/fortran/class.c
index dfa48400712..5677d920239 100644
--- a/gcc/fortran/class.c
+++ b/gcc/fortran/class.c
@@ -49,6 +49,8 @@ along with GCC; see the file COPYING3.  If not see
 * _copy: A procedure pointer to a copying procedure.
 * _final:A procedure pointer to a wrapper function, which frees
 		 allocatable components and calls FINAL subroutines.
+* _deallocate: A procedure pointer to a deallocation procedure; nonnull
+		 only for a recursive derived type.
 
After these follow procedure pointer components for the specific
type-bound procedures.  */


[RFC2][PATCH] SLP vectorize across PHI nodes

2020-10-21 Thread Richard Biener
This enables SLP build to handle PHI nodes in full, continuing
the SLP build to non-backedges.  For loop vectorization this
enables outer loop vectorization of nested SLP cycles and for
BB vectorization this enables vectorization of PHIs at CFG merges.

Vectorized backedge defs are now filled using this info which
requires sanitizing the SLP tree for SLP reduction chains even
more, manually filling the backedge SLP def.

This also exposes the fact that CFG copying (and edge splitting
until I fixed that) ends up with different edge order in the
copy which doesn't play well with the desired 1:1 mapping of
SLP PHI node children and edges for epilogue vectorization.
I've tried to fixup CFG copying here but this really looks
like a dead (or expensive) end there so I've done fixup in
slpeel_tree_duplicate_loop_to_edge_cfg instead for the cases
we can run into.

There's still NULLs in the SLP_TREE_CHILDREN vectors and I'm
not sure it's possible to eliminate them all so the patch
has quite some checks for this case all over the place.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

I still have to track down two SPEC 2k6 build ICEs with the patch,
but otherwise it would have been ready.

Richard.

2020-10-21  Richard Biener  

* gimple.h (gimple_expr_type): For PHIs return the type
of the result.
* tree-vect-loop-manip.c (slpeel_tree_duplicate_loop_to_edge_cfg):
Make sure edge order into copied loop headers line up with the
originals.
* tree-vect-loop.c (vect_transform_cycle_phi): Handle nested
loops with SLP.
(vectorizable_phi): New function.
(vectorizable_live_operation): For BB vectorization compute insert
location here.
* tree-vect-slp.c (vect_free_slp_tree): Deal with NULL
SLP_TREE_CHILDREN entries.
(vect_print_slp_graph): Likewise.
(vect_mark_slp_stmts): Likewise.
(vect_mark_slp_stmts_relevant): Likewise.
(vect_gather_slp_loads): Likewise.
(vect_optimize_slp): Likewise.
(vect_slp_analyze_node_operations): Likewise.
(vect_bb_slp_scalar_cost): Likewise.
(vect_remove_slp_scalar_calls): Likewise.
(vect_get_and_check_slp_defs): Handle PHIs and mark backedge
defs.
(vect_build_slp_tree_1): Handle PHIs.
(vect_build_slp_tree_2): Continue SLP build, following PHI
arguments.
(vect_analyze_slp_instance): Set the backedge SLP def for
reduction chains.
(vect_analyze_slp_backedges): Skip already set backedges,
set the SLP child corresponding to the edge.
(vect_slp_build_vertices): Adjust leaf condition.
(vect_bb_slp_mark_live_stmts): Handle PHIs.
(vect_bb_partition_graph_r): Likewise.
(vect_slp_function): Adjust split condition to allow CFG
merges.
(vect_schedule_slp_instance): Adjust.
(vect_fill_vectorized_backedge_defs): New function.
(vect_schedule_slp): Call it.  Remove ad-hoc vectorized
backedge fill code.
* tree-vect-stmts.c (vect_analyze_stmt): Call
vectorizable_phi.
(vect_transform_stmt): Likewise.
(vect_is_simple_use): Handle vect_backedge_def.
* tree-vectorizer.c (vec_info::new_stmt_vec_info): Only
set loop header PHIs to vect_unknown_def_type for loop
vectorization.
* tree-vectorizer.h (enum vect_def_type): Add vect_backedge_def.
(enum stmt_vec_info_type): Add phi_info_type.
(vectorizable_phi): Declare.

* gcc.dg/vect/bb-slp-54.c: New test.
* gcc.dg/vect/vect-outer-slp-1.c: New test.
---
 gcc/gimple.h |   2 +
 gcc/testsuite/gcc.dg/vect/bb-slp-54.c|  23 ++
 gcc/testsuite/gcc.dg/vect/vect-outer-slp-1.c |  31 ++
 gcc/tree-vect-loop-manip.c   |  27 ++
 gcc/tree-vect-loop.c | 108 +-
 gcc/tree-vect-slp.c  | 378 ---
 gcc/tree-vect-stmts.c|  11 +-
 gcc/tree-vectorizer.c|   3 +-
 gcc/tree-vectorizer.h|   3 +
 9 files changed, 442 insertions(+), 144 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-54.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-outer-slp-1.c

diff --git a/gcc/gimple.h b/gcc/gimple.h
index 3c9b9965f5a..87c90be9a6a 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -6598,6 +6598,8 @@ gimple_expr_type (const gimple *stmt)
 }
   else if (code == GIMPLE_COND)
 return boolean_type_node;
+  else if (code == GIMPLE_PHI)
+return TREE_TYPE (gimple_phi_result (stmt));
   else
 return void_type_node;
 }
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-54.c b/gcc/testsuite/gcc.dg/vect/bb-slp-54.c
new file mode 100644
index 000..d05ce33310d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-54.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target 

Re: PING [PATCH] Enable GCC support for Intel Key Locker extension

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 1:48 PM Uros Bizjak  wrote:
>
> On Wed, Oct 21, 2020 at 11:11 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > > IIRC, adding a new regclass is O(n^2), so it should be avoided. I
> > > think that the new patterns should follow the same path as vzeroall
> > > and vzeroupper patterns, where we emit the pattern with explicit hard
> > > regs.
> > >
> > > BTW: We do have SSE_FIRST_REG class, but this class was added to solve
> > > some reload problems in the past by marking %xmm0 as likely spilled.
> >
> > Thanks for your suggestion, we have removed the register classes and 
> > constraints, and
> > set explicit sse hard registers in the expander. The corresponding patterns 
> > are also adjusted,
> >
> > Update and rebased patch.
>
> The attached patch goes only half-way to using explicit registers. As
> said previously, please see how avx_vzeroall expander is generating
> its insn pattern, and how *avx_vzeroall matches the generated pattern
> using "vzeroall_operation" predicate.

For example:

+(define_insn "encodekey128u32"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+(unspec_volatile:SI
+  [(match_operand:SI   1 "register_operand" "r")
+   (match_operand:V2DI 3 "register_operand" "2")]
+ UNSPECV_ENCODEKEY128U32))

should be generated as:

(parallel [
  (set ( ... as above ... )
(unspec_volatile:SI [( ... as above ... ) ( reg:V2DI 20 xmm0 )]
UNSPEC_ENCODEKEY128U32))

followed by the serie of:

   (set (reg:V2DI 20 xmm0)
(unspec_volatile:V2DI [(const_int 0)] UNSPECV_ENCODEKEY128U32))

no need to duplicate already listed input operands in unspec_volatile.

followed by another serie of:

   (set (reg:V2DI 26 xmm6)
(const_vector:V2DI [(const_int 0) (const_int 0)]))

to tell the optimizer that some registers now hold zero, so the value
in the register can eventually be reused elsewhere.

and finish the parallel with clobber of flags_reg.

Another example:

+(define_insn "aesu8"
+  [(set (reg:CCZ FLAGS_REG)
+(unspec_volatile:CCZ [(match_operand:BLK 0 "memory_operand" "m")
+  (match_operand:V2DI 9  "register_operand" "1")
+  (match_operand:V2DI 2  "sse_reg_operand")
+  (match_operand:V2DI 3  "sse_reg_operand")
+  (match_operand:V2DI 4  "sse_reg_operand")
+  (match_operand:V2DI 5  "sse_reg_operand")
+  (match_operand:V2DI 6  "sse_reg_operand")
+  (match_operand:V2DI 7  "sse_reg_operand")
+  (match_operand:V2DI 8  "sse_reg_operand")]
+ AESDECENCWIDEKL))
+   (set (match_operand:V2DI 1 "register_operand" "=Yz")
+(unspec_volatile:V2DI [(const_int 0)] AESDECENCWIDEKL))
+   (set (match_dup 2)
+(unspec_volatile:V2DI [(const_int 0)] AESDECENCWIDEKL))

This should be written as:

parallel [
  (set ( ... as above ... )
(unspec_volatile:CCZ [( ... as above, BLK only ... )]
UNSPEC_AESDECENWIDEKL))

followed by a series of:

   (set (reg:V2DI 20 xmm0)
(unspec_volatile:V2DI [(reg:V2DI 20 xmm0)] UNSPEC_AESDECENCWIDEKL))

And please see the mentioned expander and pattern how the above series
are generated and matched.

Uros.


[PATCH] openmp: Change omp_get_initial_device () to match OpenMP 5.1 requirements

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Thu, Oct 15, 2020 at 01:02:29PM +0200, Jakub Jelinek via Gcc-patches wrote:
> Therefore, I think until omp_get_initial_device () value is changed, we

The following so far untested patch implements that change.

OpenMP 4.5 said for omp_get_initial_device:
The value of the device number is implementation defined. If it is between 0
and one less than omp_get_num_devices() then it is valid for use with all
device constructs and routines; if it is outside that range, then it is only
valid for use with the device memory routines and not in the device clause.
and OpenMP 5.0 similarly, but OpenMP 5.1 says:
The value of the device number is the value returned by the omp_get_num_devices 
routine.

As the new value is compatible with what has been required earlier, I think
we can change it already now.
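
A small usage example (hand-written; it relies only on the 5.1 behavior
described above):

  #include <omp.h>
  #include <stdlib.h>

  int
  main (void)
  {
    /* With this change the two values are equal, as OpenMP 5.1 requires.  */
    int initial = omp_get_initial_device ();
    if (initial != omp_get_num_devices ())
      abort ();
    /* The initial device number remains valid for the device memory
       routines; on the host this behaves like malloc/free.  */
    void *p = omp_target_alloc (64, initial);
    omp_target_free (p, initial);
    return 0;
  }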

2020-10-21  Jakub Jelinek  

* icv.c (omp_get_initial_device): Remove including corresponding
ialias.
* icv-device.c (omp_get_initial_device): New function.  Return
gomp_get_num_devices ().  Add ialias.
* target.c (resolve_device): Don't fail with
OMP_TARGET_OFFLOAD=mandatory if device_id is equal to
gomp_get_num_devices ().
(omp_target_alloc, omp_target_free, omp_target_is_present,
omp_target_memcpy, omp_target_memcpy_rect, omp_target_associate_ptr,
omp_target_disassociate_ptr, omp_pause_resource): Use
gomp_get_num_devices () instead of GOMP_DEVICE_HOST_FALLBACK on the
first use in the functions, in uses dominated by the
gomp_get_num_devices call use num_devices_openmp instead.
* libgomp.texi (omp_get_initial_device): Document.
* config/gcn/icv-device.c (omp_get_initial_device): New function.
Add ialias.
* config/nvptx/icv-device.c (omp_get_initial_device): Likewise.
* testsuite/libgomp.c/target-40.c: New test.

--- libgomp/icv.c.jj2020-10-13 22:29:22.202958364 +0200
+++ libgomp/icv.c   2020-10-21 12:31:05.941289010 +0200
@@ -156,12 +156,6 @@ omp_get_proc_bind (void)
 }
 
 int
-omp_get_initial_device (void)
-{
-  return GOMP_DEVICE_HOST_FALLBACK;
-}
-
-int
 omp_get_num_places (void)
 {
   return gomp_places_list_len;
@@ -241,7 +235,6 @@ ialias (omp_get_max_active_levels)
 ialias (omp_get_supported_active_levels)
 ialias (omp_get_cancellation)
 ialias (omp_get_proc_bind)
-ialias (omp_get_initial_device)
 ialias (omp_get_max_task_priority)
 ialias (omp_get_num_places)
 ialias (omp_get_place_num)
--- libgomp/icv-device.c.jj 2020-01-12 11:54:39.016374137 +0100
+++ libgomp/icv-device.c2020-10-21 12:32:24.827131320 +0200
@@ -43,6 +43,12 @@ omp_get_default_device (void)
 }
 
 int
+omp_get_initial_device (void)
+{
+  return gomp_get_num_devices ();
+}
+
+int
 omp_get_num_devices (void)
 {
   return gomp_get_num_devices ();
@@ -57,5 +63,6 @@ omp_is_initial_device (void)
 
 ialias (omp_set_default_device)
 ialias (omp_get_default_device)
+ialias (omp_get_initial_device)
 ialias (omp_get_num_devices)
 ialias (omp_is_initial_device)
--- libgomp/target.c.jj 2020-10-20 19:51:38.149361531 +0200
+++ libgomp/target.c2020-10-21 12:43:19.336526122 +0200
@@ -118,7 +118,8 @@ resolve_device (int device_id)
   if (device_id < 0 || device_id >= gomp_get_num_devices ())
 {
   if (gomp_target_offload_var == GOMP_TARGET_OFFLOAD_MANDATORY
- && device_id != GOMP_DEVICE_HOST_FALLBACK)
+ && device_id != GOMP_DEVICE_HOST_FALLBACK
+ && device_id != num_devices_openmp)
gomp_fatal ("OMP_TARGET_OFFLOAD is set to MANDATORY, "
"but device not found");
 
@@ -132,8 +133,7 @@ resolve_device (int device_id)
 {
gomp_mutex_unlock (&devices[device_id].lock);
 
-  if (gomp_target_offload_var == GOMP_TARGET_OFFLOAD_MANDATORY
- && device_id != GOMP_DEVICE_HOST_FALLBACK)
+  if (gomp_target_offload_var == GOMP_TARGET_OFFLOAD_MANDATORY)
gomp_fatal ("OMP_TARGET_OFFLOAD is set to MANDATORY, "
"but device is finalized");
 
@@ -2716,7 +2716,7 @@ GOMP_teams (unsigned int num_teams, unsi
 void *
 omp_target_alloc (size_t size, int device_num)
 {
-  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+  if (device_num == gomp_get_num_devices ())
 return malloc (size);
 
   if (device_num < 0)
@@ -2742,7 +2742,7 @@ omp_target_free (void *device_ptr, int d
   if (device_ptr == NULL)
 return;
 
-  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+  if (device_num == gomp_get_num_devices ())
 {
   free (device_ptr);
   return;
@@ -2773,7 +2773,7 @@ omp_target_is_present (const void *ptr,
   if (ptr == NULL)
 return 1;
 
-  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+  if (device_num == gomp_get_num_devices ())
 return 1;
 
   if (device_num < 0)
@@ -2807,7 +2807,7 @@ omp_target_memcpy (void *dst, const void
   struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
   bool ret;
 
-  if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK)
+  if (dst_device_num != 

Re: PING [PATCH] Enable GCC support for Intel Key Locker extension

2020-10-21 Thread Uros Bizjak via Gcc-patches
On Wed, Oct 21, 2020 at 11:11 AM Hongyu Wang  wrote:
>
> Hi,
>
> > IIRC, adding a new regclass is O(n^2), so it should be avoided. I
> > think that the new patterns should follow the same path as vzeroall
> > and vzeroupper patterns, where we emit the pattern with explicit hard
> > regs.
> >
> > BTW: We do have SSE_FIRST_REG class, but this class was added to solve
> > some reload problems in the past by marking %xmm0 as likely spilled.
>
> Thanks for your suggestion, we have removed the register classes and 
> constraints, and
> set explicit sse hard registers in the expander. The corresponding patterns 
> are also adjusted,
>
> Update and rebased patch.

The attached patch goes only half-way to using explicit registers. As
said previously, please see how avx_vzeroall expander is generating
its insn pattern, and how *avx_vzeroall matches the generated pattern
using "vzeroall_operation" predicate.

Uros.


Re: [PATCH] LTO: get_section: add new argument

2020-10-21 Thread Martin Liška

On 10/21/20 1:17 PM, Martin Liška wrote:

On 10/21/20 12:06 PM, Jan Hubicka wrote:

I think the streaming should happen only from ipa-fnsummary.
Originally ipa-prop was ipa-cp only, then indirect inlining was added,
but these days we have a specialized analysis pass and thus ipa-prop
should be integrated into it.


All right, there's a WIP patch but it ICEs at various places:

gcc main.o
during IPA pass: fnsummary
lto1: internal compiler error: Segmentation fault
0xc909ff crash_signal
 /home/marxin/Programming/gcc/gcc/toplev.c:330
0x7788e6bf ???
 
/usr/src/debug/glibc-2.32-1.1.x86_64/signal/../sysdeps/unix/sysv/linux/x86_64/sigaction.c:0
0xa7cfbd hash_table_mod1(unsigned int, unsigned int)
 /home/marxin/Programming/gcc/gcc/hash-table.h:344
0xa7cfbd hash_table, ipa_node_params*, 
simple_hashmap_traits >, ipa_node_params*> >::hash_entry, 
false, xcallocator>::find_with_hash(int const&, unsigned int)
 /home/marxin/Programming/gcc/gcc/hash-table.h:911
0xa79216 hash_map, ipa_node_params*, 
simple_hashmap_traits >, ipa_node_params*> 
>::get(int const&)
 /home/marxin/Programming/gcc/gcc/hash-map.h:185
0xa79216 function_summary::get(cgraph_node*)
 /home/marxin/Programming/gcc/gcc/symbol-summary.h:163
0xa79216 inline_read_section
 /home/marxin/Programming/gcc/gcc/ipa-fnsummary.c:4314
0xa79ee0 ipa_fn_summary_read
 /home/marxin/Programming/gcc/gcc/ipa-fnsummary.c:4478
0xbc0ead ipa_read_summaries_1
 /home/marxin/Programming/gcc/gcc/passes.c:2844
0x7e31aa read_cgraph_and_symbols(unsigned int, char const**)
 /home/marxin/Programming/gcc/gcc/lto/lto-common.c:2919
0x7cb6e2 lto_main()
 /home/marxin/Programming/gcc/gcc/lto/lto.c:625

Can you or Martin please finish the patch?
Thanks,
Martin


... adding missing patch.

Martin
>From 8c765ebad21da7f34a5038b4df8c4d29fb391055 Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Wed, 21 Oct 2020 11:11:03 +0200
Subject: [PATCH] LTO: get_section: add new argument

gcc/ChangeLog:

	PR lto/97508
	* langhooks.c (lhd_begin_section): Call get_section with
	not_existing = true.
	* output.h (get_section): Add new argument.
	* varasm.c (get_section): Fail when NOT_EXISTING is true
	and a section already exists.
	* ipa-cp.c (ipcp_write_summary): Remove.
	(ipcp_read_summary): Likewise.
---
 gcc/ipa-cp.c| 20 ++--
 gcc/langhooks.c |  2 +-
 gcc/output.h|  3 ++-
 gcc/varasm.c| 12 ++--
 4 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 2152f9e5876..db87329bc0c 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -5943,22 +5943,6 @@ ipcp_generate_summary (void)
 ipa_analyze_node (node);
 }
 
-/* Write ipcp summary for nodes in SET.  */
-
-static void
-ipcp_write_summary (void)
-{
-  ipa_prop_write_jump_functions ();
-}
-
-/* Read ipcp summary.  */
-
-static void
-ipcp_read_summary (void)
-{
-  ipa_prop_read_jump_functions ();
-}
-
 namespace {
 
 const pass_data pass_data_ipa_cp =
@@ -5980,8 +5964,8 @@ public:
   pass_ipa_cp (gcc::context *ctxt)
 : ipa_opt_pass_d (pass_data_ipa_cp, ctxt,
 		  ipcp_generate_summary, /* generate_summary */
-		  ipcp_write_summary, /* write_summary */
-		  ipcp_read_summary, /* read_summary */
+		  NULL, /* write_summary */
+		  NULL, /* read_summary */
 		  ipcp_write_transformation_summaries, /*
 		  write_optimization_summary */
 		  ipcp_read_transformation_summaries, /*
diff --git a/gcc/langhooks.c b/gcc/langhooks.c
index 8819a8859d4..d82f54251fd 100644
--- a/gcc/langhooks.c
+++ b/gcc/langhooks.c
@@ -790,7 +790,7 @@ lhd_begin_section (const char *name)
 saved_section = text_section;
 
   /* Create a new section and switch to it.  */
-  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL);
+  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL, true);
   switch_to_section (section);
 }
 
diff --git a/gcc/output.h b/gcc/output.h
index eb253c50329..2f2f1697fd8 100644
--- a/gcc/output.h
+++ b/gcc/output.h
@@ -523,7 +523,8 @@ extern GTY(()) bool in_cold_section_p;
 
 extern section *get_unnamed_section (unsigned int, void (*) (const void *),
  const void *);
-extern section *get_section (const char *, unsigned int, tree);
+extern section *get_section (const char *, unsigned int, tree,
+			 bool not_existing = false);
 extern section *get_named_section (tree, const char *, int);
 extern section *get_variable_section (tree, bool);
 extern void place_block_symbol (rtx);
diff --git a/gcc/varasm.c b/gcc/varasm.c
index ea0b59cf44a..207c9b077d1 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -277,10 +277,12 @@ get_noswitch_section (unsigned int flags, noswitch_section_callback callback)
 }
 
 /* Return the named section structure associated with NAME.  Create
-   a new section with the given fields if no such structure exists.  */
+   a new section with the given fields if no such structure exists.
+   When NOT_EXISTING, then fail if the section already exists.  

Re: [PATCH] LTO: get_section: add new argument

2020-10-21 Thread Martin Liška

On 10/21/20 12:06 PM, Jan Hubicka wrote:

I think the streaming should happen only from ipa-fnsummary.
Originally ipa-prop was ipa-cp only, then indirect inlining was added,
but these days we have a specialized analysis pass and thus ipa-prop
should be integrated into it.


All right, there's a WIP patch but it ICEs at various places:

gcc main.o
during IPA pass: fnsummary
lto1: internal compiler error: Segmentation fault
0xc909ff crash_signal
/home/marxin/Programming/gcc/gcc/toplev.c:330
0x7788e6bf ???

/usr/src/debug/glibc-2.32-1.1.x86_64/signal/../sysdeps/unix/sysv/linux/x86_64/sigaction.c:0
0xa7cfbd hash_table_mod1(unsigned int, unsigned int)
/home/marxin/Programming/gcc/gcc/hash-table.h:344
0xa7cfbd hash_table, ipa_node_params*, 
simple_hashmap_traits >, ipa_node_params*> >::hash_entry, 
false, xcallocator>::find_with_hash(int const&, unsigned int)
/home/marxin/Programming/gcc/gcc/hash-table.h:911
0xa79216 hash_map, ipa_node_params*, 
simple_hashmap_traits >, ipa_node_params*> 
>::get(int const&)
/home/marxin/Programming/gcc/gcc/hash-map.h:185
0xa79216 function_summary::get(cgraph_node*)
/home/marxin/Programming/gcc/gcc/symbol-summary.h:163
0xa79216 inline_read_section
/home/marxin/Programming/gcc/gcc/ipa-fnsummary.c:4314
0xa79ee0 ipa_fn_summary_read
/home/marxin/Programming/gcc/gcc/ipa-fnsummary.c:4478
0xbc0ead ipa_read_summaries_1
/home/marxin/Programming/gcc/gcc/passes.c:2844
0x7e31aa read_cgraph_and_symbols(unsigned int, char const**)
/home/marxin/Programming/gcc/gcc/lto/lto-common.c:2919
0x7cb6e2 lto_main()
/home/marxin/Programming/gcc/gcc/lto/lto.c:625

Can you or Martin please finish the patch?
Thanks,
Martin


PATCH [DR2303][PR97453]

2020-10-21 Thread kamlesh kumar via Gcc-patches
gcc/cp/ChangeLog
---

2020-10-21  Kamlesh Kumar  

PR c++/97453
PR c++/97453
* pt.c (get_template_base): Implement DR 2303.  Consider the
closest base during template deduction when a base of a base
also matches.

gcc/testsuite/ChangeLog
--

2020-10-21  Kamlesh Kumar  

* g++.dg/DRs/dr2303.C: New test.

--

As part of this patch I implemented a fix for the defect report below from
CWG: https://wg21.cmeerw.net/cwg/issue2303 .
Regtested on x86_64 and did not find any failure.
Patch summary: remove bases of bases from the list of bases.

We create a hash_set from the list of bases, then iterate over each
element of the hash_set, find its own list of bases, and remove those
from the hash_set if present.
Finally, deduction succeeds if the hash_set contains only a single
element or is empty; otherwise deduction is ambiguous.
---
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index dc664ec3798..7adf461e108 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -22643,8 +22643,9 @@ static enum template_base_result
 get_template_base (tree tparms, tree targs, tree parm, tree arg,
 bool explain_p, tree *result)
 {
-  tree rval = NULL_TREE;
+  *result = NULL_TREE;
   tree binfo;
+  hash_set<tree> binfo_set;

   gcc_assert (RECORD_OR_UNION_CODE_P (TREE_CODE (arg)));

@@ -22659,31 +22660,51 @@ get_template_base (tree tparms, tree targs, tree parm, tree arg,
   /* Walk in inheritance graph order.  The search order is not
  important, and this avoids multiple walks of virtual bases.  */
   for (binfo = TREE_CHAIN (binfo); binfo; binfo = TREE_CHAIN (binfo))
-{
-  tree r = try_class_unification (tparms, targs, parm,
-   BINFO_TYPE (binfo), explain_p);
-
-  if (r)
- {
-   /* If there is more than one satisfactory baseclass, then:
-
-[temp.deduct.call]
+ {
+   tree r = try_class_unification (tparms, targs, parm,
+   BINFO_TYPE (binfo), explain_p);
+   if (r)
+ {
+   binfo_set.add(r);
+ }
+ }

-   If they yield more than one possible deduced A, the type
-   deduction fails.
+  /* If there is more than one satisfactory baseclass, then:
+ [temp.deduct.call]
+  If they yield more than one possible deduced A, the type
+  deduction fails.
+ However, if there is a class C that is a (direct or indirect) base
+ class of D and derived (directly or indirectly) from a class B and
+ that would be a valid deduced A, the deduced A cannot be B or
+ pointer to B, respectively.  */
+  for (hash_set<tree>::iterator it = binfo_set.begin();
+   it != binfo_set.end(); ++it)
+{
+  binfo = TYPE_BINFO (*it);
+  for (binfo = TREE_CHAIN (binfo); binfo; binfo = TREE_CHAIN (binfo))
+{
+  tree r = try_class_unification (tparms, targs, parm,
+  BINFO_TYPE (binfo), explain_p);
+  if (r && binfo_set.contains(r))
+{
+  binfo_set.remove(r);
+}
+}
+}

-  applies.  */
-   if (rval && !same_type_p (r, rval))
- {
-   *result = NULL_TREE;
-   return tbr_ambiguous_baseclass;
- }
+  if (binfo_set.elements() > 1)
+{
+  return tbr_ambiguous_baseclass;
+}

-   rval = r;
- }
+  if (binfo_set.is_empty())
+{
+  return tbr_success;
 }

-  *result = rval;
+  if (binfo_set.elements() == 1)
+{
+  *result = *binfo_set.begin();
+}
   return tbr_success;
 }

diff --git a/gcc/testsuite/g++.dg/DRs/dr2303.C
b/gcc/testsuite/g++.dg/DRs/dr2303.C
new file mode 100644
index 000..b4c23332358
--- /dev/null
+++ b/gcc/testsuite/g++.dg/DRs/dr2303.C
@@ -0,0 +1,20 @@
+// DR 2303
+// PR c++/97453
+// { dg-do compile { target c++11 } }
+
+template <typename... T>
+struct A;
+template <>
+struct A<> {};
+template <typename T, typename... Ts>
+struct A<T, Ts...> : A<Ts...> {};
+struct B : A<int, int> {};
+
+template <typename... T>
+void f(const A<T...> &) {
+  static_assert(sizeof...(T) == 2, "it should deduce to A<int, int>");
+}
+
+void g() {
+  f(B{});
+}


./kamlesh


Re: [patch] Introduce vxworks7r2 support for ppc and ppc64

2020-10-21 Thread Olivier Hainque
Hi Segher!

> On 20 Oct 2020, at 22:06, Segher Boessenkool  
> wrote:
> 
>> +# Wind River 7 post SR0600 is mostly like Linux so we setup
>> +# out config in a very similar fashion and adjust to a few
>> +# specificities.
> 
> "our config"?
> 
>> +   - Starting with VxWorks 7 (post SR600), the system environment
>> + was made extremely similar to GNU/Linux and this toolchain is
>> + builtin on top of the corresponding header files.  */
> 
> "built on top"?

Indeed.

>> +/************************************
>> + * Common definitions first         *
>> + ************************************/
> 
> We don't use such decorated comments in GCC.  But it is your header file
> of course :-)

Hmm, I’d really like to keep some visible separation for the
sections because it’s pretty dense overall and I think hard to
read without some high level hints about the general organization.

There are (a few, agreed :) instances of sectioning in other
sources, tree-core.h or tree-vectorizer.h for example, with a
different style though. I think I’ll adjust to one of these.

> I don't see anything wrong with the actual code itself, fwiw :-)

Great, feedback appreciated, thanks!

Next in line specific to rs6000 are a couple of suggestions
for updates in the testsuite (wrt fpic and dfp).

Regards,

Olivier



Re: [PATCH] LTO: get_section: add new argument

2020-10-21 Thread Jan Hubicka
> Hey.
> 
> During partial linking we call ipa_prop_write_jump_functions twice, from 2
> IPA passes (fnsummary and cp). That produces 2 compressed blocks in an ELF
> section, and then zstd complains as the section size does not correspond to
> the compressed stream.
> 
> I'm adding both sanity check changes and the fix in ipa-prop.c.
> I guess Martin and Honza can explain it in more detail?
> 
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> 
> Ready to be installed?
> Thanks,
> Martin
> 
> gcc/ChangeLog:
> 
>   PR lto/97508
>   * langhooks.c (lhd_begin_section): Call get_section with
>   not_existing = true.
>   * output.h (get_section): Add new argument.
>   * varasm.c (get_section): Fail when NOT_EXISTING is true
>   and a section already exists.
>   * ipa-prop.c (ipa_prop_write_jump_functions): Do not stream
>   twice.

I think the streaming should happen only from ipa-fnsummary.
Oriignally ipa-prop was ipa-cp only, then indirect inlining was added,
but these days we have specialized analysis pass and thus ipa-prop
should be intergrated to it.

Honza
> ---
>  gcc/ipa-prop.c  |  9 +
>  gcc/langhooks.c |  2 +-
>  gcc/output.h|  3 ++-
>  gcc/varasm.c| 12 ++--
>  4 files changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/ipa-prop.c b/gcc/ipa-prop.c
> index a848f1db95e..d43fd2eee4f 100644
> --- a/gcc/ipa-prop.c
> +++ b/gcc/ipa-prop.c
> @@ -5067,6 +5067,13 @@ ipa_prop_write_jump_functions (void)
>lto_symtab_encoder_iterator lsei;
>lto_symtab_encoder_t encoder;
> +  /* The function can be called from 2 IPA_PASSES: "fnsummary" and "cp"
> + which happens in partial linking (-r).  Prevent double streaming
> + as reported in PR97508.  */
> +  static bool already_streamed = false;
> +  if (already_streamed)
> +    return;
> +
>if (!ipa_node_params_sum || !ipa_edge_args_sum)
>  return;
> @@ -5096,6 +5103,8 @@ ipa_prop_write_jump_functions (void)
>streamer_write_char_stream (ob->main_stream, 0);
>produce_asm (ob, NULL);
>destroy_output_block (ob);
> +
> +  already_streamed = true;
>  }
>  /* Read section in file FILE_DATA of length LEN with data DATA.  */
> diff --git a/gcc/langhooks.c b/gcc/langhooks.c
> index 8819a8859d4..d82f54251fd 100644
> --- a/gcc/langhooks.c
> +++ b/gcc/langhooks.c
> @@ -790,7 +790,7 @@ lhd_begin_section (const char *name)
>  saved_section = text_section;
>/* Create a new section and switch to it.  */
> -  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL);
> +  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL, true);
>switch_to_section (section);
>  }
> diff --git a/gcc/output.h b/gcc/output.h
> index eb253c50329..2f2f1697fd8 100644
> --- a/gcc/output.h
> +++ b/gcc/output.h
> @@ -523,7 +523,8 @@ extern GTY(()) bool in_cold_section_p;
>  extern section *get_unnamed_section (unsigned int, void (*) (const void *),
>const void *);
> -extern section *get_section (const char *, unsigned int, tree);
> +extern section *get_section (const char *, unsigned int, tree,
> +  bool not_existing = false);
>  extern section *get_named_section (tree, const char *, int);
>  extern section *get_variable_section (tree, bool);
>  extern void place_block_symbol (rtx);
> diff --git a/gcc/varasm.c b/gcc/varasm.c
> index ea0b59cf44a..207c9b077d1 100644
> --- a/gcc/varasm.c
> +++ b/gcc/varasm.c
> @@ -277,10 +277,12 @@ get_noswitch_section (unsigned int flags, 
> noswitch_section_callback callback)
>  }
>  /* Return the named section structure associated with NAME.  Create
> -   a new section with the given fields if no such structure exists.  */
> +   a new section with the given fields if no such structure exists.
> +   When NOT_EXISTING, then fail if the section already exists.  */
>  section *
> -get_section (const char *name, unsigned int flags, tree decl)
> +get_section (const char *name, unsigned int flags, tree decl,
> +  bool not_existing)
>  {
>section *sect, **slot;
> @@ -297,6 +299,12 @@ get_section (const char *name, unsigned int flags, tree 
> decl)
>  }
>else
>  {
> +  if (not_existing)
> + {
> +   error ("Section already exists: %qs", name);
> +   gcc_unreachable ();
> + }
> +
>sect = *slot;
>/* It is fine if one of the sections has SECTION_NOTYPE as long as
>   the other has none of the contrary flags (see the logic at the end
> -- 
> 2.28.0
> 


[PATCH] LTO: get_section: add new argument

2020-10-21 Thread Martin Liška

Hey.

During partial linking we call ipa_prop_write_jump_functions twice, from
two IPA passes (fnsummary and cp).  That produces 2 compressed blocks in an
ELF section, and zstd then complains because the section size does not
correspond to the compressed stream.
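
In essence the fix is a write-once guard in ipa_prop_write_jump_functions;
a minimal sketch (identifiers abbreviated, the real code is in the diff
below):

  /* Persists across the fnsummary and cp IPA passes.  */
  static bool already_streamed = false;
  if (already_streamed)
    return;
  /* ... stream the jump functions once ... */
  already_streamed = true;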

I'm adding both the sanity-check changes and the fix in ipa-prop.c.
I guess Martin and Honza can explain it in more detail?

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

PR lto/97508
* langhooks.c (lhd_begin_section): Call get_section with
not_existing = true.
* output.h (get_section): Add new argument.
* varasm.c (get_section): Fail when NOT_EXISTING is true
and a section already exists.
* ipa-prop.c (ipa_prop_write_jump_functions): Do not stream
twice.
---
 gcc/ipa-prop.c  |  9 +
 gcc/langhooks.c |  2 +-
 gcc/output.h|  3 ++-
 gcc/varasm.c| 12 ++--
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/gcc/ipa-prop.c b/gcc/ipa-prop.c
index a848f1db95e..d43fd2eee4f 100644
--- a/gcc/ipa-prop.c
+++ b/gcc/ipa-prop.c
@@ -5067,6 +5067,13 @@ ipa_prop_write_jump_functions (void)
   lto_symtab_encoder_iterator lsei;
   lto_symtab_encoder_t encoder;
 
+  /* The function can be called from 2 IPA passes: "fnsummary" and "cp",
+     which happens in partial linking (-r).  Prevent double streaming
+     as reported in PR97508.  */
+  static bool already_streamed = false;
+  if (already_streamed)
+return;
+
   if (!ipa_node_params_sum || !ipa_edge_args_sum)
 return;
 
@@ -5096,6 +5103,8 @@ ipa_prop_write_jump_functions (void)

   streamer_write_char_stream (ob->main_stream, 0);
   produce_asm (ob, NULL);
   destroy_output_block (ob);
+
+  already_streamed = true;
 }
 
 /* Read section in file FILE_DATA of length LEN with data DATA.  */

diff --git a/gcc/langhooks.c b/gcc/langhooks.c
index 8819a8859d4..d82f54251fd 100644
--- a/gcc/langhooks.c
+++ b/gcc/langhooks.c
@@ -790,7 +790,7 @@ lhd_begin_section (const char *name)
 saved_section = text_section;
 
   /* Create a new section and switch to it.  */

-  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL);
+  section = get_section (name, SECTION_DEBUG | SECTION_EXCLUDE, NULL, true);
   switch_to_section (section);
 }
 
diff --git a/gcc/output.h b/gcc/output.h

index eb253c50329..2f2f1697fd8 100644
--- a/gcc/output.h
+++ b/gcc/output.h
@@ -523,7 +523,8 @@ extern GTY(()) bool in_cold_section_p;
 
 extern section *get_unnamed_section (unsigned int, void (*) (const void *),

 const void *);
-extern section *get_section (const char *, unsigned int, tree);
+extern section *get_section (const char *, unsigned int, tree,
+bool not_existing = false);
 extern section *get_named_section (tree, const char *, int);
 extern section *get_variable_section (tree, bool);
 extern void place_block_symbol (rtx);
diff --git a/gcc/varasm.c b/gcc/varasm.c
index ea0b59cf44a..207c9b077d1 100644
--- a/gcc/varasm.c
+++ b/gcc/varasm.c
@@ -277,10 +277,12 @@ get_noswitch_section (unsigned int flags, 
noswitch_section_callback callback)
 }
 
 /* Return the named section structure associated with NAME.  Create

-   a new section with the given fields if no such structure exists.  */
+   a new section with the given fields if no such structure exists.
+   When NOT_EXISTING, then fail if the section already exists.  */
 
 section *

-get_section (const char *name, unsigned int flags, tree decl)
+get_section (const char *name, unsigned int flags, tree decl,
+bool not_existing)
 {
   section *sect, **slot;
 
@@ -297,6 +299,12 @@ get_section (const char *name, unsigned int flags, tree decl)

 }
   else
 {
+  if (not_existing)
+   {
+ error ("Section already exists: %qs", name);
+ gcc_unreachable ();
+   }
+
   sect = *slot;
   /* It is fine if one of the sections has SECTION_NOTYPE as long as
  the other has none of the contrary flags (see the logic at the end
--
2.28.0



[PATCH] tree-optimization/97500 - avoid SLP backedges for inductions

2020-10-21 Thread Richard Biener
Inductions are not vectorized as a cycle but are materialized from SCEV
data.  Filling in backedge SLP nodes confuses this process.
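
As a minimal illustration (a made-up C loop, not the Fortran testcase from
the PR), the induction variable i in

  void
  f (int *a, int n)
  {
    for (int i = 0; i < n; i++)
      a[i] = i;  /* the vector of i values comes from the {0, +, 1} SCEV */
  }

is code generated from its scalar evolution rather than from the i = i + 1
backedge, so the SLP graph must not record a backedge for its PHI.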

Bootstrapped / tested on x86_64-unknown-linux-gnu, pushed.

2020-10-21  Richard Biener  

PR tree-optimization/97500
* tree-vect-slp.c (vect_analyze_slp_backedges): Do not
fill backedges for inductions.

* gfortran.dg/pr97500.f90: New testcase.
---
 gcc/testsuite/gfortran.dg/pr97500.f90 | 35 +++
 gcc/tree-vect-slp.c   |  6 +
 2 files changed, 41 insertions(+)
 create mode 100644 gcc/testsuite/gfortran.dg/pr97500.f90

diff --git a/gcc/testsuite/gfortran.dg/pr97500.f90 
b/gcc/testsuite/gfortran.dg/pr97500.f90
new file mode 100644
index 000..d63b8616ad6
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr97500.f90
@@ -0,0 +1,35 @@
+! { dg-do run }
+! { dg-additional-options "-ftree-vectorize -fno-guess-branch-probability" }
+module testmod
+  implicit none
+
+  contains
+
+  subroutine foo(n)
+integer, intent(in) :: n
+real :: r(0:n,-n:n), a(0:n,-n:n), dj
+integer :: k, j
+
+! initialize with some dummy values
+do j = -n, n
+  a(:, j) = j
+  r(:,j) = j + 1
+end do
+
+! here be dragons
+do k = 0, n
+  dj = r(k, k - 2) * a(k, k - 2)
+  r(k,k) = a(k, k - 1) * dj
+enddo
+
+if (r(0,0) .ne. -2.) STOP 1
+
+  end subroutine
+
+end module
+
+program test
+  use testmod
+  implicit none
+  call foo(5)
+end program
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 0c1447e7aa0..e3f94cb8a2d 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2380,6 +2380,12 @@ vect_analyze_slp_backedges (vec_info *vinfo, slp_tree 
node,
 if (child)
   vect_analyze_slp_backedges (vinfo, child, bst_map, visited);
 
+  /* Inductions are not vectorized by vectorizing their defining cycle
+ but by materializing the values from SCEV data.  */
+  if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (node))
+  == vect_induction_def)
+return;
+
   if (gphi *phi = dyn_cast <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
 for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
   {
-- 
2.26.2


Re: [PATCH][PR 97506] Simplify trivial vcond_expr in expander.

2020-10-21 Thread Hongtao Liu via Gcc-patches
On Wed, Oct 21, 2020 at 5:07 PM Jakub Jelinek  wrote:
>
> On Wed, Oct 21, 2020 at 02:29:07PM +0800, Hongtao Liu via Gcc-patches wrote:
> > gcc/ChangeLog:
> >
> > PR target/97506
> > * config/i386/i386-expand.c (ix86_expand_sse_movcc): Move
> > op_true to dest directly When op_true equals op_false,
>
> Lowercase when in the middle of sentence.  Use . instead of , at the end.
>
> > --- a/gcc/config/i386/i386-expand.c
> > +++ b/gcc/config/i386/i386-expand.c
> > @@ -3525,6 +3525,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx
> > op_true, rtx op_false)
> >machine_mode mode = GET_MODE (dest);
> >machine_mode cmpmode = GET_MODE (cmp);
> >
> > +  /* Simplify trivial vcond_expr to avoid ICE error in pr97506.  */
>
> There is no such thing as vcond_expr, I'd say use VEC_COND_EXPR instead.
> Please change ICE error to just ICE, ICE stands for internal compiler error,
> so the error word is in there already.
>
> Otherwise LGTM.
>

Thanks for the review; I'll commit the patch with the above adjustments.

> > +  if (rtx_equal_p (op_true, op_false))
> > +{
> > +  emit_move_insn (dest, op_true);
> > +  return;
> > +}
> > +
> >/* In AVX512F the result of comparison is an integer mask.  */
> >bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr97506.c
> > b/gcc/testsuite/gcc.target/i386/pr97506.c
> > new file mode 100644
> > index 000..74714cfab2c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr97506.c
> > @@ -0,0 +1,19 @@
> > +/* PR target/97506  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Og -finline-functions-called-once -fno-tree-ccp
> > -mavx512vbmi -mavx512vl" } */
> > +
> > +typedef unsigned char __attribute__ ((__vector_size__ (16))) U;
> > +typedef int __attribute__ ((__vector_size__ (4))) V;
> > +U u;
> > +
> > +void
> > +bar (int i, V v)
> > +{
> > +  u += (char) i & (char) i > (U){};
> > +}
> > +
> > +void
> > +foo (void)
> > +{
> > +  bar (0, (V){});
> > +}
> > --
> > 2.18.1
> >
> >
> > --
> > BR,
> > Hongtao
>
> Jakub
>


-- 
BR,
Hongtao


Re: [PATCH] vect: Remove redundant LOOP_VINFO_FULLY_MASKED_P

2020-10-21 Thread Richard Biener via Gcc-patches
On Wed, Oct 21, 2020 at 10:58 AM Kewen.Lin via Gcc-patches
 wrote:
>
> Hi,
>
> This is a very trivial patch: it removes a redundant
> LOOP_VINFO_FULLY_MASKED_P condition check, since the condition is
> already checked in vect_use_loop_mask_for_alignment_p.
>
> Is it OK for trunk?

OK.

>
> BR,
> Kewen
> -
> gcc/ChangeLog:
>
> * tree-vect-loop.c (vect_transform_loop): Remove the redundant
> LOOP_VINFO_FULLY_MASKED_P check.
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index dba230f6320..5e9e25add73 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -8913,8 +8913,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
> *loop_vectorized_call)
>
>split_edge (loop_preheader_edge (loop));
>
> -  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> -  && vect_use_loop_mask_for_alignment_p (loop_vinfo))
> +  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
>  /* This will deal with any possible peeling.  */
>  vect_prepare_for_masked_peels (loop_vinfo);


Re: [PATCH] Saturate overflows return from SCEV in ranger.

2020-10-21 Thread Richard Biener via Gcc-patches
On Wed, Oct 21, 2020 at 10:50 AM Aldy Hernandez  wrote:
>
>
>
> On 10/21/20 9:59 AM, Richard Biener wrote:
>
> >>> /* Even for valid range info, sometimes overflow flag will leak in.
> >>>As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
> >>>drop them.  */
> >>> if (TREE_OVERFLOW_P (*min))
> >>>   *min = drop_tree_overflow (*min);
> >>> if (TREE_OVERFLOW_P (*max))
> >>>   *max = drop_tree_overflow (*max);
> >>
> >> Interesting.
> >>
> >> If these values "leaked" in, should they have been fixed at the source,
> >> instead of after the fact?  You mention below that every use of
> >> TREE_OVERFLOW in the ME is a bug, should we clean them up before
> >> arriving in gimple, or are there legitimate uses of it?
> >
> > There are no legitimate uses in GIMPLE.  They are (ab-)used by
> > GENERIC folding for propagating overflow (also used in FE
> > diagnostics).  Generally the better way is to use wide_ints overflow
> > handling which also "sticks".
>
> If there are no legitimate uses, perhaps we should drop them altogether
> as we go into GIMPLE??

We do, but they tend to creep back in via infrastructure using the
GENERIC folder.  Some key places make sure to clear them (I've tracked
down a lot of them).  But I was never brave enough to assert they do
not end up in IL operands ;)
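
For reference, the wide_int style mentioned above carries the overflow
indication in an explicit out-parameter that callers must inspect, so it
"sticks"; a rough fragment, assuming wide_int operands a and b:

  wi::overflow_type ovf;
  wide_int sum = wi::add (a, b, SIGNED, &ovf);
  if (ovf != wi::OVF_NONE)
    ; /* overflow has to be handled explicitly here */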

>  I vaguely recall seeing them leak into
> value_range's.
> >
> >>>
> >>> and the code explicitly checks for overflow, doing range adjustments
> >>> accordingly.
> >>
> >> Well, not all overflows are adjusted:
> >>
> >> /* Like in PR19590, scev can return a constant function.  */
> >> if (is_gimple_min_invariant (chrec))
> >>   {
> >> *min = *max = chrec;
> >> return true;
> >>   }
> >>
> >> Are these min/max not adjusted for overflow by design, or is this an
> >> oversight?
> >
> > Ah, that's an oversight here.  And yes, "fixing" it in scalar evolution
> > analysis itself (dropping the flag there) would be best
>
> Excellent.  I've pushed the patch below after testing it.
>
> Thanks again.
> Aldy
>
>  Adjust overflow for invariants in bounds_of_var_in_loop.
>
>  Invariants returned from SCEV can have TREE_OVERFLOW set.  Clear the
>  overflow as we do with the rest of the values returned from this
>  function.
>
>  gcc/ChangeLog:
>
>  * gimple-range.cc
> (gimple_ranger::range_of_ssa_name_with_loop_info):
>  Remove TREE_OVERFLOW special case.
>  * vr-values.c (bounds_of_var_in_loop): Adjust overflow for
>  invariants.
>
> diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
> index b790d62d75f..c5520e0700b 100644
> --- a/gcc/gimple-range.cc
> +++ b/gcc/gimple-range.cc
> @@ -1156,9 +1156,9 @@ gimple_ranger::range_of_ssa_name_with_loop_info
> (irange &r, tree name,
> // ?? We could do better here.  Since MIN/MAX can only be an
> // SSA, SSA +- INTEGER_CST, or INTEGER_CST, we could easily call
> // the ranger and solve anything not an integer.
> -  if (TREE_CODE (min) != INTEGER_CST || TREE_OVERFLOW (min))
> +  if (TREE_CODE (min) != INTEGER_CST)
> min = vrp_val_min (type);
> -  if (TREE_CODE (max) != INTEGER_CST || TREE_OVERFLOW (max))
> +  if (TREE_CODE (max) != INTEGER_CST)
> max = vrp_val_max (type);
> r.set (min, max);
>   }
> diff --git a/gcc/vr-values.c b/gcc/vr-values.c
> index cc0ddca2bd3..7a0e70eab64 100644
> --- a/gcc/vr-values.c
> +++ b/gcc/vr-values.c
> @@ -1844,7 +1844,7 @@ bounds_of_var_in_loop (tree *min, tree *max,
> range_query *query,
> if (is_gimple_min_invariant (chrec))
>   {
> *min = *max = chrec;
> -  return true;
> +  goto fix_overflow;
>   }
>
> if (TREE_CODE (chrec) != POLYNOMIAL_CHREC)
> @@ -1964,6 +1964,7 @@ bounds_of_var_in_loop (tree *min, tree *max,
> range_query *query,
> else
>   *min = init;
>
> + fix_overflow:
> /* Even for valid range info, sometimes overflow flag will leak in.
>As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
>drop them.  */
>


[PATCH] aarch64: Add vcopy(q)__lane(q)_bf16 intrinsics

2020-10-21 Thread Andrea Corallo via Gcc-patches
Hi all,

I'd like to submit the following patch implementing the bfloat16_t
NEON-related copy intrinsics: vcopy_lane_bf16, vcopyq_lane_bf16,
vcopyq_laneq_bf16, vcopy_laneq_bf16.

Please refer to:
ACLE 
ISA  
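
A hedged usage sketch (not from the patch; the lane numbers are arbitrary
and a bf16-capable AArch64 target is assumed):

  #include <arm_neon.h>

  /* Copy lane 2 of b into lane 0 of a; the remaining lanes of a are kept.  */
  bfloat16x4_t
  set_lane0 (bfloat16x4_t a, bfloat16x4_t b)
  {
    return vcopy_lane_bf16 (a, 0, b, 2);
  }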

Regtested and bootstrapped.

Regards

  Andrea

From d1335c0f49df849b87ee522e9507023113051839 Mon Sep 17 00:00:00 2001
From: Andrea Corallo 
Date: Thu, 8 Oct 2020 12:29:00 +0200
Subject: [PATCH] aarch64: Add vcopy(q)__lane(q)_bf16 intrinsics

gcc/ChangeLog

2020-10-20  Andrea Corallo  

* config/aarch64/arm_neon.h (vcopy_lane_bf16, vcopyq_lane_bf16)
(vcopyq_laneq_bf16, vcopy_laneq_bf16): New intrinsics.

gcc/testsuite/ChangeLog

2020-10-20  Andrea Corallo  

* gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c:
New test.
* gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c:
Likewise.
---
 gcc/config/aarch64/arm_neon.h | 36 +++
 .../bf16_vect_copy_lane_1.c   | 32 +
 .../vcopy_lane_bf16_indices_1.c   | 18 ++
 .../vcopy_lane_bf16_indices_2.c   | 18 ++
 .../vcopy_laneq_bf16_indices_1.c  | 17 +
 .../vcopy_laneq_bf16_indices_2.c  | 17 +
 .../vcopyq_lane_bf16_indices_1.c  | 17 +
 .../vcopyq_lane_bf16_indices_2.c  | 17 +
 .../vcopyq_laneq_bf16_indices_1.c | 17 +
 .../vcopyq_laneq_bf16_indices_2.c | 17 +
 10 files changed, 206 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vect_copy_lane_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_lane_bf16_indices_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopy_laneq_bf16_indices_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_lane_bf16_indices_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcopyq_laneq_bf16_indices_2.c

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0088ea9896f..9c801661775 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35155,6 +35155,42 @@ vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, 
float32x4_t __a)
   return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a);
 }
 
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_lane_bf16 (bfloat16x4_t __a, const int __lane1,
+bfloat16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_lane_bf16 (bfloat16x8_t __a, const int __lane1,
+ bfloat16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_laneq_bf16 (bfloat16x4_t __a, const int __lane1,
+ bfloat16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_laneq_bf16 (bfloat16x8_t __a, const int __lane1,
+  bfloat16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);

[PATCH 2/2] libsanitizer: add test-case

2020-10-21 Thread Martin Liška

gcc/testsuite/ChangeLog:

PR sanitizer/97414
* g++.dg/asan/pr97414.C: New test.

(cherry picked from commit 6c5b08a2ca935c5db68e79d33e5c5b752252115c)
---
 gcc/testsuite/g++.dg/asan/pr97414.C | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/asan/pr97414.C

diff --git a/gcc/testsuite/g++.dg/asan/pr97414.C 
b/gcc/testsuite/g++.dg/asan/pr97414.C
new file mode 100644
index 000..6ea03906daa
--- /dev/null
+++ b/gcc/testsuite/g++.dg/asan/pr97414.C
@@ -0,0 +1,19 @@
+/* PR sanitizer/97414 */
+/* { dg-do run } */
+/* { dg-set-target-env-var ASAN_OPTIONS 
"detect_invalid_pointer_pairs=1:halt_on_error=1,detect_stack_use_after_return=1"
 } */
+/* { dg-options "-fsanitize=address,pointer-compare,pointer-subtract" } */
+
+[[gnu::noinline]] auto pointer_diff(const int *begin, const int *end) {
+  return end - begin;
+}
+
+int main() {
+  constexpr auto size = (2048 / sizeof(int)) + 1;
+
+  auto buf = new int[size];
+  auto end = buf + size;
+  pointer_diff(end, buf);
+  delete[] buf;
+
+  return 0;
+}
--
2.28.0



[PATCH 1/2] ASAN: Support detect_invalid_pointer_pairs=1 with detect_stack_use_after_return=1

2020-10-21 Thread Martin Liška

Do not crash when AsanThread::GetStackVariableShadowStart does not find
a variable for a pointer on a shadow stack.

Cherry-pick from ad2be02a833e56f7fe280797280b219eb3312621.

Differential Revision: https://reviews.llvm.org/D89552

(cherry picked from commit b69f33f477b9ac38af3c39465600ae74a3554878)
---
 libsanitizer/asan/asan_thread.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libsanitizer/asan/asan_thread.cpp 
b/libsanitizer/asan/asan_thread.cpp
index 6734d9a1668..cb374b28622 100644
--- a/libsanitizer/asan/asan_thread.cpp
+++ b/libsanitizer/asan/asan_thread.cpp
@@ -366,7 +366,9 @@ uptr AsanThread::GetStackVariableShadowStart(uptr addr) {
 bottom = stack_bottom();
   } else if (has_fake_stack()) {
 bottom = fake_stack()->AddrIsInFakeStack(addr);
-CHECK(bottom);
+if (bottom == 0) {
+  return 0;
+}
   } else {
 return 0;
   }
--
2.28.0




Re: PING [PATCH] Enable GCC support for Intel Key Locker extension

2020-10-21 Thread Hongyu Wang via Gcc-patches
Hi,

> IIRC, adding a new regclass is O(n^2), so it should be avoided. I
> think that the new patterns should follow the same path as vzeroall
> and vzeroupper patterns, where we emit the pattern with explicit hard
> regs.
>
> BTW: We do have SSE_FIRST_REG class, but this class was added to solve
> some reload problems in the past by marking %xmm0 as likely spilled.

Thanks for your suggestion; we have removed the register classes and
constraints, and set explicit SSE hard registers in the expander.  The
corresponding patterns are also adjusted.

Updated and rebased patch.


Uros Bizjak wrote on Wed, Oct 14, 2020 at 11:56 PM:

> Hello!
>
> > This patch is about to support Intel Key Locker extension.
> >
> > Key Locker provides a mechanism to encrypt and decrypt data with an AES
> key without having access to the raw key value.
> >
> > For more details, please refer to
>
> https://software.intel.com/content/dam/develop/external/us/en/documents/343965-intel-key-locker-specification.pdf
> .
> >
> > Bootstrap ok, regression test on i386/x86 backend is ok.
> >
> > OK for master?
>
> @@ -1414,6 +1418,13 @@ enum reg_class
>FP_TOP_REG, FP_SECOND_REG, /* %st(0) %st(1) */
>FLOAT_REGS,
>SSE_FIRST_REG,
> +  SSE_SECOND_REG,
> +  SSE_THIRD_REG,
> +  SSE_FOURTH_REG,
> +  SSE_FIFTH_REG,
> +  SSE_SIXTH_REG,
> +  SSE_SEVENTH_REG,
> +  SSE_EIGHTH_REG,
>NO_REX_SSE_REGS,
>SSE_REGS,
>ALL_SSE_REGS,
> @@ -1474,6 +1485,13 @@ enum reg_class
> "FP_TOP_REG", "FP_SECOND_REG", \
> "FLOAT_REGS", \
> "SSE_FIRST_REG", \
> +   "SSE_SECOND_REG", \
> +   "SSE_THIRD_REG", \
> +   "SSE_FOURTH_REG", \
> +   "SSE_FIFTH_REG", \
> +   "SSE_SIXTH_REG", \
> +   "SSE_SEVENTH_REG", \
> +   "SSE_EIGHTH_REG", \
> "NO_REX_SSE_REGS", \
> "SSE_REGS", \
> "ALL_SSE_REGS", \
> @@ -1513,6 +1531,13 @@ enum reg_class
>   { 0x200,0x0,   0x0 }, /* FP_SECOND_REG */ \
>  { 0xff00,0x0,   0x0 }, /* FLOAT_REGS */ \
>{ 0x10,0x0,   0x0 }, /* SSE_FIRST_REG */ \
> +  { 0x20,0x0,   0x0 }, /* SSE_SECOND_REG */ \
> +  { 0x40,0x0,   0x0 }, /* SSE_THIRD_REG */ \
> +  { 0x80,0x0,   0x0 }, /* SSE_FOURTH_REG */ \
> + { 0x100,0x0,   0x0 }, /* SSE_FIFTH_REG */ \
> + { 0x200,0x0,   0x0 }, /* SSE_SIXTH_REG*/ \
> + { 0x400,0x0,   0x0 }, /* SSE_SEVENTH_REG */ \
> + { 0x800,0x0,   0x0 }, /* SSE_EIGHTH_REG */ \
>   { 0xff0,0x0,   0x0 }, /* NO_REX_SSE_REGS */ \
>   { 0xff0,0xff000,   0x0 }, /* SSE_REGS */ \
>   { 0xff0, 0xf000,   0xf }, /* ALL_SSE_REGS */ \
>
> IIRC, adding a new regclass is O(n^2), so it should be avoided. I
> think that the new patterns should follow the same path as vzeroall
> and vzeroupper patterns, where we emit the pattern with explicit hard
> regs.
>
> BTW: We do have SSE_FIRST_REG class, but this class was added to solve
> some reload problems in the past by marking %xmm0 as likely spilled.
>
> Uros.
>
From 6fcb89ab7f51de70baca12e46a14fb2d1fed67d5 Mon Sep 17 00:00:00 2001
From: liuhongt 
Date: Thu, 5 Mar 2020 17:36:02 +0800
Subject: [PATCH] Enable GCC to support Intel Key Locker ISA

gcc/ChangeLog

2018-12-15  Xuepeng Guo  

	* common/config/i386/cpuinfo.h (get_available_features):
	Detect KL, AESKLE and WIDEKL features.
	* common/config/i386/i386-common.c
	(OPTION_MASK_ISA_KL_SET): New.
	(OPTION_MASK_ISA_WIDEKL_SET): Likewise.
	(OPTION_MASK_ISA_KL_UNSET): Likewise.
	(OPTION_MASK_ISA_WIDEKL_UNSET): Likewise.
	(OPTION_MASK_ISA2_AVX2_UNSET): Likewise.
	(OPTION_MASK_ISA2_AVX_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE4_2_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE4_1_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE4_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSSE3_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE3_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE2_UNSET): Likewise.
	(OPTION_MASK_ISA2_SSE_UNSET): Likewise.
	(ix86_handle_option): Handle kl and widekl, add dependency chain
	for KL and SSE2.
	* common/config/i386/i386-cpuinfo.h (enum processor_features):
	(FEATURE_KL, FEATURE_AESKLE, FEATURE_WIDEKL): New.
	* common/config/i386/i386-isas.h: Add ISA_NAMES_TABLE_ENTRY
	for KL, AESKLE and WIDEKL.
	* config.gcc: Add keylockerintrin.h.
	* doc/invoke.texi: Document new option -mkl and -mwidekl.
	* doc/extend.texi: Document kl and widekl.
	* config/i386/cpuid.h (bit_KL, bit_AESKLE, bit_WIDEKL): New.
	* config/i386/i386-builtin-types.def ((UINT, UINT, V2DI, V2DI, PVOID),
	(UINT, UINT, V2DI, PVOID), (VOID, V2DI, V2DI, V2DI, UINT),
	(UINT8, PV2DI, V2DI, PCVOID), (UINT8, PV2DI, PCV2DI, PCVOID)): New
	function types.
	* config/i386/i386-builtin.def: Add
	__builtin_ia32_loadiwkey,
	__builtin_ia32_aesdec128kl_u8,
	__builtin_ia32_aesdec256kl_u8,
	__builtin_ia32_aesenc128kl_u8,
	__builtin_ia32_aesenc256kl_u8,
	__builtin_ia32_aesdecwide128kl_u8,
	__builtin_ia32_aesdecwide256kl_u8,
	__builtin_ia32_aesencwide128kl_u8,
	__builtin_ia32_aesencwide256kl_u8,
	__builtin_ia32_encodekey128_u32,
	__builtin_ia32_encodekey256_u32.
	* 

Re: [PATCH][PR 97506] Simplify trivial vcond_expr in expander.

2020-10-21 Thread Jakub Jelinek via Gcc-patches
On Wed, Oct 21, 2020 at 02:29:07PM +0800, Hongtao Liu via Gcc-patches wrote:
> gcc/ChangeLog:
> 
> PR target/97506
> * config/i386/i386-expand.c (ix86_expand_sse_movcc): Move
> op_true to dest directly When op_true equals op_false,

Lowercase when in the middle of sentence.  Use . instead of , at the end.

> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -3525,6 +3525,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx
> op_true, rtx op_false)
>machine_mode mode = GET_MODE (dest);
>machine_mode cmpmode = GET_MODE (cmp);
> 
> +  /* Simplify trivial vcond_expr to avoid ICE error in pr97506.  */

There is no such thing as vcond_expr, I'd say use VEC_COND_EXPR instead.
Please change ICE error to just ICE, ICE stands for internal compiler error,
so the error word is in there already.

Otherwise LGTM.

> +  if (rtx_equal_p (op_true, op_false))
> +{
> +  emit_move_insn (dest, op_true);
> +  return;
> +}
> +
>/* In AVX512F the result of comparison is an integer mask.  */
>bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
> 
> diff --git a/gcc/testsuite/gcc.target/i386/pr97506.c
> b/gcc/testsuite/gcc.target/i386/pr97506.c
> new file mode 100644
> index 000..74714cfab2c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr97506.c
> @@ -0,0 +1,19 @@
> +/* PR target/97506  */
> +/* { dg-do compile } */
> +/* { dg-options "-Og -finline-functions-called-once -fno-tree-ccp
> -mavx512vbmi -mavx512vl" } */
> +
> +typedef unsigned char __attribute__ ((__vector_size__ (16))) U;
> +typedef int __attribute__ ((__vector_size__ (4))) V;
> +U u;
> +
> +void
> +bar (int i, V v)
> +{
> +  u += (char) i & (char) i > (U){};
> +}
> +
> +void
> +foo (void)
> +{
> +  bar (0, (V){});
> +}
> -- 
> 2.18.1
> 
> 
> -- 
> BR,
> Hongtao

Jakub



[PATCH] vect: Remove redundant LOOP_VINFO_FULLY_MASKED_P

2020-10-21 Thread Kewen.Lin via Gcc-patches
Hi,

This is a very trivial patch: it removes a redundant
LOOP_VINFO_FULLY_MASKED_P condition check, since the condition is
already checked in vect_use_loop_mask_for_alignment_p.

Is it OK for trunk?

BR,
Kewen
-
gcc/ChangeLog:

* tree-vect-loop.c (vect_transform_loop): Remove the redundant
LOOP_VINFO_FULLY_MASKED_P check.

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index dba230f6320..5e9e25add73 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -8913,8 +8913,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
*loop_vectorized_call)

   split_edge (loop_preheader_edge (loop));

-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-  && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
 /* This will deal with any possible peeling.  */
 vect_prepare_for_masked_peels (loop_vinfo);


Re: [PATCH] Saturate overflows return from SCEV in ranger.

2020-10-21 Thread Aldy Hernandez via Gcc-patches




On 10/21/20 9:59 AM, Richard Biener wrote:


/* Even for valid range info, sometimes overflow flag will leak in.
   As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
   drop them.  */
if (TREE_OVERFLOW_P (*min))
  *min = drop_tree_overflow (*min);
if (TREE_OVERFLOW_P (*max))
  *max = drop_tree_overflow (*max);


Interesting.

If these values "leaked" in, should they have been fixed at the source,
instead of after the fact?  You mention below that every use of
TREE_OVERFLOW in the ME is a bug, should we clean them up before
arriving in gimple, or are there legitimate uses of it?


There are no legitimate uses in GIMPLE.  They are (ab-)used by
GENERIC folding for propagating overflow (also used in FE
diagnostics).  Generally the better way is to use wide_ints overflow
handling which also "sticks".


If there are no legitimate uses, perhaps we should drop them altogether 
as we go into GIMPLE??  I vaguely recall seeing them leak into 
value_range's.






and the code explicitly checks for overflow, doing range adjustments
accordingly.


Well, not all overflows are adjusted:

/* Like in PR19590, scev can return a constant function.  */
if (is_gimple_min_invariant (chrec))
  {
*min = *max = chrec;
return true;
  }

Are these min/max not adjusted for overflow by design, or is this an
oversight?


Ah, that's an oversight here.  And yes, "fixing" it in scalar evolution
analysis itself (dropping the flag there) would be best


Excellent.  I've pushed the patch below after testing it.

Thanks again.
Aldy

Adjust overflow for invariants in bounds_of_var_in_loop.

Invariants returned from SCEV can have TREE_OVERFLOW set.  Clear the
overflow as we do with the rest of the values returned from this
function.

gcc/ChangeLog:

* gimple-range.cc (gimple_ranger::range_of_ssa_name_with_loop_info):
Remove TREE_OVERFLOW special case.
* vr-values.c (bounds_of_var_in_loop): Adjust overflow for
invariants.

diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
index b790d62d75f..c5520e0700b 100644
--- a/gcc/gimple-range.cc
+++ b/gcc/gimple-range.cc
@@ -1156,9 +1156,9 @@ gimple_ranger::range_of_ssa_name_with_loop_info 
(irange &r, tree name,

   // ?? We could do better here.  Since MIN/MAX can only be an
   // SSA, SSA +- INTEGER_CST, or INTEGER_CST, we could easily call
   // the ranger and solve anything not an integer.
-  if (TREE_CODE (min) != INTEGER_CST || TREE_OVERFLOW (min))
+  if (TREE_CODE (min) != INTEGER_CST)
min = vrp_val_min (type);
-  if (TREE_CODE (max) != INTEGER_CST || TREE_OVERFLOW (max))
+  if (TREE_CODE (max) != INTEGER_CST)
max = vrp_val_max (type);
   r.set (min, max);
 }
diff --git a/gcc/vr-values.c b/gcc/vr-values.c
index cc0ddca2bd3..7a0e70eab64 100644
--- a/gcc/vr-values.c
+++ b/gcc/vr-values.c
@@ -1844,7 +1844,7 @@ bounds_of_var_in_loop (tree *min, tree *max, 
range_query *query,

   if (is_gimple_min_invariant (chrec))
 {
   *min = *max = chrec;
-  return true;
+  goto fix_overflow;
 }

   if (TREE_CODE (chrec) != POLYNOMIAL_CHREC)
@@ -1964,6 +1964,7 @@ bounds_of_var_in_loop (tree *min, tree *max, 
range_query *query,

   else
 *min = init;

+ fix_overflow:
   /* Even for valid range info, sometimes overflow flag will leak in.
  As GIMPLE IL should have no constants with TREE_OVERFLOW set, we
  drop them.  */



[PATCH 1/2] Separate new_edges compute in copy_bbs

2020-10-21 Thread Richard Biener
This separates out the loop that finds new_edges from edges in copy_bbs,
making it cheaper overall: from the total number of successors in the
copied bbs times num_edges down to num_edges times the cost of a
find_edge lookup.

Bootstrapped / tested on x86_64-unknown-linux-gnu, pushed.

2020-10-21  Richard Biener  

* cfghooks.c (copy_bbs): Split out loop computing new_edges.
---
 gcc/cfghooks.c | 21 +++--
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/gcc/cfghooks.c b/gcc/cfghooks.c
index 71c6b63ad3b..14c006df6e1 100644
--- a/gcc/cfghooks.c
+++ b/gcc/cfghooks.c
@@ -1391,8 +1391,6 @@ copy_bbs (basic_block *bbs, unsigned n, basic_block 
*new_bbs,
 }
 
   /* Redirect edges.  */
-  for (j = 0; j < num_edges; j++)
-new_edges[j] = NULL;
   for (i = 0; i < n; i++)
 {
   edge_iterator ei;
@@ -1401,15 +1399,26 @@ copy_bbs (basic_block *bbs, unsigned n, basic_block 
*new_bbs,
 
   FOR_EACH_EDGE (e, ei, new_bb->succs)
{
- for (j = 0; j < num_edges; j++)
-   if (edges[j] && edges[j]->src == bb && edges[j]->dest == e->dest)
- new_edges[j] = e;
-
  if (!(e->dest->flags & BB_DUPLICATED))
continue;
  redirect_edge_and_branch_force (e, get_bb_copy (e->dest));
}
 }
+  for (j = 0; j < num_edges; j++)
+{
+  if (!edges[j])
+   new_edges[j] = NULL;
+  else
+   {
+ basic_block src = edges[j]->src;
+ basic_block dest = edges[j]->dest;
+ if (src->flags & BB_DUPLICATED)
+   src = get_bb_copy (src);
+ if (dest->flags & BB_DUPLICATED)
+   dest = get_bb_copy (dest);
+ new_edges[j] = find_edge (src, dest);
+   }
+}
 
   /* Clear information about duplicates.  */
   for (i = 0; i < n; i++)
-- 
2.26.2



[PATCH 2/2] Simplify CFG copying tables

2020-10-21 Thread Richard Biener
This simplifies the maps between original and new basic blocks and
loops as used for CFG copying.  Instead of using a pointer hash
table with separately allocated mapping entries, use a hash_map with
int_hash, removing the indirection and code duplication.  We can use -1 and
-2 as empty/deleted values as those are not valid basic-block
indices or loop numbers.
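
A rough usage sketch of the new map type (illustrative only; bb and copy
stand for a block and its duplicate):

  copy_map_t map (10);
  map.put (bb->index, copy->index);  /* record the copy */
  int *idx = map.get (bb->index);    /* NULL when no copy was recorded */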

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2020-10-21  Richard Biener  

* cfg.c (htab_bb_copy_original_entry): Remove.
(bb_copy_hasher): Likewise.
(bb_original, bb_copy, loop_copy): Use
hash_map, int>.
(original_copy_bb_pool): Remove.
(initialize_original_copy_tables): Adjust.
(reset_original_copy_tables): Likewise.
(free_original_copy_tables): Likewise.
(original_copy_tables_initialized_p): Likewise.
(copy_original_table_clear): Simplify.
(copy_original_table_set): Likewise.
(get_bb_original): Likewise.
(get_bb_copy): Likewise.
(get_loop_copy): Likewise.
---
 gcc/cfg.c | 114 +++---
 1 file changed, 23 insertions(+), 91 deletions(-)

diff --git a/gcc/cfg.c b/gcc/cfg.c
index 270a48f729a..d82324faf03 100644
--- a/gcc/cfg.c
+++ b/gcc/cfg.c
@@ -959,55 +959,23 @@ scale_bbs_frequencies (basic_block *bbs, int nbbs,
 bbs[i]->count = bbs[i]->count.apply_probability (p);
 }
 
-/* Helper types for hash tables.  */
-
-struct htab_bb_copy_original_entry
-{
-  /* Block we are attaching info to.  */
-  int index1;
-  /* Index of original or copy (depending on the hashtable) */
-  int index2;
-};
-
-struct bb_copy_hasher : nofree_ptr_hash <htab_bb_copy_original_entry>
-{
-  static inline hashval_t hash (const htab_bb_copy_original_entry *);
-  static inline bool equal (const htab_bb_copy_original_entry *existing,
-   const htab_bb_copy_original_entry * candidate);
-};
-
-inline hashval_t
-bb_copy_hasher::hash (const htab_bb_copy_original_entry *data)
-{
-  return data->index1;
-}
-
-inline bool
-bb_copy_hasher::equal (const htab_bb_copy_original_entry *data,
-  const htab_bb_copy_original_entry *data2)
-{
-  return data->index1 == data2->index1;
-}
-
 /* Data structures used to maintain mapping between basic blocks and
copies.  */
-static hash_table<bb_copy_hasher> *bb_original;
-static hash_table<bb_copy_hasher> *bb_copy;
+typedef hash_map<int_hash<int, -1, -2>, int> copy_map_t;
+static copy_map_t *bb_original;
+static copy_map_t *bb_copy;
 
 /* And between loops and copies.  */
-static hash_table<bb_copy_hasher> *loop_copy;
-static object_allocator<htab_bb_copy_original_entry> *original_copy_bb_pool;
+static copy_map_t *loop_copy;
 
 /* Initialize the data structures to maintain mapping between blocks
and its copies.  */
 void
 initialize_original_copy_tables (void)
 {
-  original_copy_bb_pool = new object_allocator<htab_bb_copy_original_entry>
-    ("original_copy");
-  bb_original = new hash_table<bb_copy_hasher> (10);
-  bb_copy = new hash_table<bb_copy_hasher> (10);
-  loop_copy = new hash_table<bb_copy_hasher> (10);
+  bb_original = new copy_map_t (10);
+  bb_copy = new copy_map_t (10);
+  loop_copy = new copy_map_t (10);
 }
 
 /* Reset the data structures to maintain mapping between blocks and
@@ -1016,7 +984,6 @@ initialize_original_copy_tables (void)
 void
 reset_original_copy_tables (void)
 {
-  gcc_assert (original_copy_bb_pool);
   bb_original->empty ();
   bb_copy->empty ();
   loop_copy->empty ();
@@ -1027,15 +994,12 @@ reset_original_copy_tables (void)
 void
 free_original_copy_tables (void)
 {
-  gcc_assert (original_copy_bb_pool);
   delete bb_copy;
   bb_copy = NULL;
   delete bb_original;
   bb_original = NULL;
   delete loop_copy;
   loop_copy = NULL;
-  delete original_copy_bb_pool;
-  original_copy_bb_pool = NULL;
 }
 
 /* Return true iff we have had a call to initialize_original_copy_tables
@@ -1044,51 +1008,31 @@ free_original_copy_tables (void)
 bool
 original_copy_tables_initialized_p (void)
 {
-  return original_copy_bb_pool != NULL;
+  return bb_copy != NULL;
 }
 
 /* Removes the value associated with OBJ from table TAB.  */
 
 static void
-copy_original_table_clear (hash_table<bb_copy_hasher> *tab, unsigned obj)
+copy_original_table_clear (copy_map_t *tab, unsigned obj)
 {
-  htab_bb_copy_original_entry **slot;
-  struct htab_bb_copy_original_entry key, *elt;
-
-  if (!original_copy_bb_pool)
+  if (!original_copy_tables_initialized_p ())
 return;
 
-  key.index1 = obj;
-  slot = tab->find_slot (&key, NO_INSERT);
-  if (!slot)
-return;
-
-  elt = *slot;
-  tab->clear_slot (slot);
-  original_copy_bb_pool->remove (elt);
+  tab->remove (obj);
 }
 
 /* Sets the value associated with OBJ in table TAB to VAL.
Do nothing when data structures are not initialized.  */
 
 static void
-copy_original_table_set (hash_table<bb_copy_hasher> *tab,
+copy_original_table_set (copy_map_t *tab,
 unsigned obj, unsigned val)
 {
-  struct htab_bb_copy_original_entry **slot;
-  struct htab_bb_copy_original_entry key;
-
-  if (!original_copy_bb_pool)
+  if (!original_copy_tables_initialized_p ())
 return;
 
-  key.index1 = obj;
-  slot = 

Re: [PATCH] phiopt: Optimize x ? __builtin_clz (x) : 32 in GIMPLE [PR97503]

2020-10-21 Thread Richard Biener
On Wed, 21 Oct 2020, Jakub Jelinek wrote:

> Hi!
> 
> While we have at the RTL level noce_try_ifelse_collapse combined with
> simplify_cond_clz_ctz, that optimization doesn't always trigger because
> e.g. on powerpc there is an define_insn to compare a reg against zero and
> copy that register to another one and so we end up with a different pseudo
> in the simplify_cond_clz_ctz test and punt.
> 
> For targets that define C?Z_DEFINED_VALUE_AT_ZERO to 2 for certain modes,
> we can optimize it already in phiopt though, just need to ensure that
> we transform the __builtin_c?z* calls into .C?Z ifns because my recent
> VRP changes codified that the builtin calls are always undefined at zero,
> while ifns honor C?Z_DEFINED_VALUE_AT_ZERO equal to 2.
> And, in phiopt we already have popcount handling that does pretty much the
> same thing, except for always using a zero value rather than the one set
> by C?Z_DEFINED_VALUE_AT_ZERO.
> 
> So, this patch extends that function to handle not just popcount, but also
> clz and ctz.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
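
For reference, the shape of source this catches (an illustrative function
modeled on the subject line, not the actual pr97503.c testcase):

  int
  nbits (unsigned int x)
  {
    /* Collapses to a single .CLZ call when CLZ_DEFINED_VALUE_AT_ZERO
       is 2 and the value it defines at zero is 32.  */
    return x ? __builtin_clz (x) : 32;
  }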
> 2020-10-20  Jakub Jelinek  
> 
>   PR tree-optimization/97503
>   * tree-ssa-phiopt.c (cond_removal_in_popcount_pattern): Rename to ...
>   (cond_removal_in_popcount_clz_ctz_pattern): ... this.  Handle not just
>   popcount, but also clz and ctz if it has C?Z_DEFINED_VALUE_AT_ZERO 2.
> 
>   * gcc.dg/tree-ssa/pr97503.c: New test.
> 
> --- gcc/tree-ssa-phiopt.c.jj  2020-07-28 15:39:10.075755306 +0200
> +++ gcc/tree-ssa-phiopt.c 2020-10-20 17:46:16.971329154 +0200
> @@ -61,8 +61,9 @@ static bool minmax_replacement (basic_bl
>   edge, edge, gimple *, tree, tree);
>  static bool abs_replacement (basic_block, basic_block,
>edge, edge, gimple *, tree, tree);
> -static bool cond_removal_in_popcount_pattern (basic_block, basic_block,
> -   edge, edge, gimple *, tree, tree);
> +static bool cond_removal_in_popcount_clz_ctz_pattern (basic_block, 
> basic_block,
> +   edge, edge, gimple *,
> +   tree, tree);
>  static bool cond_store_replacement (basic_block, basic_block, edge, edge,
>   hash_set<gimple *> *);
>  static bool cond_if_else_store_replacement (basic_block, basic_block, 
> basic_block);
> @@ -344,8 +345,9 @@ tree_ssa_phiopt_worker (bool do_store_el
> else if (abs_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
>   cfgchanged = true;
> else if (!early_p
> -&& cond_removal_in_popcount_pattern (bb, bb1, e1, e2,
> - phi, arg0, arg1))
> +&& cond_removal_in_popcount_clz_ctz_pattern (bb, bb1, e1,
> + e2, phi, arg0,
> + arg1))
>   cfgchanged = true;
> else if (minmax_replacement (bb, bb1, e1, e2, phi, arg0, arg1))
>   cfgchanged = true;
> @@ -1777,16 +1779,20 @@ minmax_replacement (basic_block cond_bb,
>  
> 
> c_12 = PHI <_9(2)>
> -*/
> +
> +   Similarly for __builtin_clz or __builtin_ctz if
> +   C?Z_DEFINED_VALUE_AT_ZERO is 2, optab is present and
> +   instead of 0 above it uses the value from that macro.  */
>  
>  static bool
> -cond_removal_in_popcount_pattern (basic_block cond_bb, basic_block middle_bb,
> -   edge e1, edge e2,
> -   gimple *phi, tree arg0, tree arg1)
> +cond_removal_in_popcount_clz_ctz_pattern (basic_block cond_bb,
> +   basic_block middle_bb,
> +   edge e1, edge e2, gimple *phi,
> +   tree arg0, tree arg1)
>  {
>gimple *cond;
>gimple_stmt_iterator gsi, gsi_from;
> -  gimple *popcount;
> +  gimple *call;
>gimple *cast = NULL;
>tree lhs, arg;
>  
> @@ -1804,35 +1810,65 @@ cond_removal_in_popcount_pattern (basic_
>    gsi_next_nondebug (&gsi);
>if (!gsi_end_p (gsi))
>  {
> -  popcount = gsi_stmt (gsi);
> +  call = gsi_stmt (gsi);
>    gsi_next_nondebug (&gsi);
>if (!gsi_end_p (gsi))
>   return false;
>  }
>else
>  {
> -  popcount = cast;
> +  call = cast;
>cast = NULL;
>  }
>  
> -  /* Check that we have a popcount builtin.  */
> -  if (!is_gimple_call (popcount))
> +  /* Check that we have a popcount/clz/ctz builtin.  */
> +  if (!is_gimple_call (call) || gimple_call_num_args (call) != 1)
>  return false;
> -  combined_fn cfn = gimple_call_combined_fn (popcount);
> +
> +  arg = gimple_call_arg (call, 0);
> +  lhs = gimple_get_lhs (call);
> +
> +  if (lhs == NULL_TREE)
> +return false;
> +
> +  combined_fn cfn = gimple_call_combined_fn (call);
> +  internal_fn ifn = 

[PATCH] rs6000: Don't split constant operator add before reload, move to temp register for future optimization

2020-10-21 Thread Xionghu Luo via Gcc-patches
This is a revised version of the patch posted at
https://gcc.gnu.org/pipermail/gcc-patches/2020-March/542718.html; resending
it since this is quite a high-priority performance issue for Power.

Don't split the constant add in add<mode>3 for SDI before reload; allow a
later pass to split it.  This lets later logic hoist the constant load out
of add instructions.  In a loop, lis+ori can be hoisted out to improve
performance compared with the previous addis+addi (about 15% on a typical
case); the weak point is that one more register is used and one more
instruction is generated, i.e.:

addis 3,3,0x6765
addi 3,3,0x4321

=>

lis 9,0x6765
ori 9,9,0x4321
add 3,3,9

Likewise, paddi is replaced with pli+add for Power10. No obvious performance
and binary size change to SPEC2017.
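
An illustrative (hypothetical) kernel where the hoisting matters:

  /* With the add kept unsplit until after reload, the lis+ori pair that
     materializes 0x67654321 is loop-invariant and can move out of the
     loop; the old addis+addi form tied the constant to every add.  */
  long
  sum (long *p, long n)
  {
    long s = 0;
    for (long i = 0; i < n; i++)
      s += p[i] + 0x67654321;
    return s;
  }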

gcc/ChangeLog:

2020-10-21  Xiong Hu Luo  

* config/rs6000/rs6000.md (add<mode>3 for SDI): Don't split before
reload; move the constant to a temp register for the add.

gcc/testsuite/ChangeLog:

2020-10-21  Xiong Hu Luo  

* gcc.target/powerpc/prefix-add.c: Check pli instead of paddi.
* gcc.target/powerpc/prefix-no-update.c: Likewise.
* gcc.target/powerpc/add-const.c: New test.
---
 gcc/config/rs6000/predicates.md   |  3 +-
 gcc/config/rs6000/rs6000.md   | 54 +++
 gcc/testsuite/gcc.target/powerpc/add-const.c  | 18 +++
 gcc/testsuite/gcc.target/powerpc/prefix-add.c |  4 +-
 .../gcc.target/powerpc/prefix-no-update.c |  2 +-
 5 files changed, 54 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/add-const.c

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 4c2fe7fa312..af577da669e 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -859,8 +859,7 @@ (define_special_predicate "indexed_address_mem"
 (define_predicate "add_operand"
   (if_then_else (match_code "const_int")
 (match_test "satisfies_constraint_I (op)
-|| satisfies_constraint_L (op)
-|| satisfies_constraint_eI (op)")
+|| satisfies_constraint_L (op)")
 (match_operand 0 "gpc_reg_operand")))
 
 ;; Return 1 if the operand is either a non-special register, or 0, or -1.
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 779bfd11237..facf6e12114 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1750,34 +1750,44 @@ (define_expand "add<mode>3"
 
   if (CONST_INT_P (operands[2]) && !add_operand (operands[2], <MODE>mode))
 {
-  rtx tmp = ((!can_create_pseudo_p ()
- || rtx_equal_p (operands[0], operands[1]))
-? operands[0] : gen_reg_rtx (<MODE>mode));
+  bool reg0 = (reg_or_subregno (operands[0]) == 0);
+  if (can_create_pseudo_p () || reg0)
+   {
+
+ rtx tmp = (!can_create_pseudo_p ()
+ || rtx_equal_p (operands[0], operands[1]))
+   ? operands[0] : gen_reg_rtx (<MODE>mode);
 
   /* Adding a constant to r0 is not a valid insn, so use a different
-strategy in that case.  */
-  if (reg_or_subregno (operands[1]) == 0 || reg_or_subregno (tmp) == 0)
-   {
- if (operands[0] == operands[1])
-   FAIL;
- rs6000_emit_move (operands[0], operands[2], <MODE>mode);
- emit_insn (gen_add<mode>3 (operands[0], operands[1], operands[0]));
+strategy in that case.  See stack-limit.c: we need to generate
+"24: %0:DI=0x20fa0; 25: %0:DI=%14:DI+%0:DI" in pro_and_epilogue
+when can_create_pseudo_p is false.  */
+ if (reg0 == 0 || reg_or_subregno (tmp) == 0)
+ {
+   if (operands[0] == operands[1])
+ FAIL;
+ }
+
+ rs6000_emit_move (tmp, operands[2], <MODE>mode);
+ emit_insn (gen_add<mode>3 (operands[0], operands[1], tmp));
  DONE;
}
+  else
+   {
+ HOST_WIDE_INT val = INTVAL (operands[2]);
+ HOST_WIDE_INT low = ((val & 0xffff) ^ 0x8000) - 0x8000;
+ HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
 
-  HOST_WIDE_INT val = INTVAL (operands[2]);
-  HOST_WIDE_INT low = ((val & 0xffff) ^ 0x8000) - 0x8000;
-  HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
-
-  if (<MODE>mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
-   FAIL;
+ if (<MODE>mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
+   FAIL;
 
-  /* The ordering here is important for the prolog expander.
-When space is allocated from the stack, adding 'low' first may
-produce a temporary deallocation (which would be bad).  */
-  emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (rest)));
-  emit_insn (gen_add<mode>3 (operands[0], tmp, GEN_INT (low)));
-  DONE;
+ /* The ordering here is important for the prolog expander.
+When space is allocated from the stack, adding 'low' first may
+produce a temporary deallocation (which would be bad).  */
+ emit_insn (gen_add<mode>3 (operands[0], operands[1], GEN_INT 

Re: [committed][nvptx] Remove -m32

2020-10-21 Thread Tom de Vries
On 10/20/20 3:48 PM, Tobias Burnus wrote:
> On 10/15/20 3:26 PM, Tom de Vries wrote:
>>   PR target/97436
>>   * config/nvptx/nvptx.opt (m32): Comment out.
>>   * doc/invoke.texi (NVPTX options): Remove -m32.
> 
> This caused the warning: doc/invoke.texi:25617: warning: @itemx should
> not begin @table
> 
> Fixed by the committed attached patch.

Thanks for fixing this.

- Tom

