Re: [PATCH] [PR rtl-optimization/97249]Simplify vec_select of paradoxical subreg.

2020-10-18 Thread Hongtao Liu via Gcc-patches
On Thu, Oct 15, 2020 at 8:38 PM Richard Sandiford
 wrote:
>
> Hongtao Liu via Gcc-patches  writes:
> > +   /* Simplify vec_select of a subreg of X to just a vec_select of X
> > +  when X has same component mode as vec_select.  */
> > +   int l2;
> > +   if (GET_CODE (trueop0) == SUBREG
> > +   && GET_MODE_INNER (mode)
> > +  == GET_MODE_INNER (GET_MODE (XEXP (trueop0, 0)))
>
> Better to use SUBREG_REG here and below.
>

Yes and changed.

> > +   && (GET_MODE_NUNITS (GET_MODE (trueop0))).is_constant ()
> > +   && (GET_MODE_NUNITS (mode)).is_constant ()
> > +   && (GET_MODE_NUNITS (GET_MODE (XEXP (trueop0, 0
> > +   .is_constant ()
> > +   && known_le (l1, l2))
> > + {
> > +   unsigned HOST_WIDE_INT subreg_offset = 0;
> > +   gcc_assert (known_eq (XVECLEN (trueop1, 0), l1));
> > +   gcc_assert (can_div_trunc_p (exact_div (subreg_lsb (trueop0), 
> > BITS_PER_UNIT),
> > +GET_MODE_SIZE (GET_MODE_INNER 
> > (mode)),
> > +_offset));
>
> can_div_trunc_p discards the remainder, whereas it looks like here
> you want an exact multiple.
>
> I don't think it's absolutely guaranteed that the “if” condition makes
> the division by GET_MODE_SIZE exact.  E.g. in principle you could have
> a subreg of a vector of TIs in which the subreg offset is misaligned by
> a DI offset.
>
> I'm not sure the subreg_lsb conversion is correct though.  On big-endian
> targets, lane numbering follows memory layout, just like subreg byte
> offsets do.  So ISTM that using SUBREG_BYTE (as per the earlier patch)
> was correct.
>
> In summary, I think the “if” condition should include something like:
>
>   constant_multiple_p (SUBREG_BYTE (trueop0),
>GET_MODE_UNIT_BITSIZE (mode),
>&subreg_offset)
>

Changed.

> Thanks,
> Richard


Update patch.

-- 
BR,
Hongtao
From 8d154067963e453c337e6dc2c4f3f19bf0d6e11b Mon Sep 17 00:00:00 2001
From: liuhongt 
Date: Tue, 13 Oct 2020 15:35:29 +0800
Subject: [PATCH] Simplify vec_select of a subreg of X to just a vec_select of
 X.

gcc/ChangeLog
	PR rtl-optimization/97249
	* simplify-rtx.c (simplify_binary_operation_1): Simplify
	vec_select of a subreg of X to a vec_select of X.

gcc/testsuite/ChangeLog

	* gcc.target/i386/pr97249-1.c: New test.
---
 gcc/simplify-rtx.c| 44 +++
 gcc/testsuite/gcc.target/i386/pr97249-1.c | 30 
 2 files changed, 74 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr97249-1.c

diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 869f0d11b2e..b1009837b2b 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -4170,6 +4170,50 @@ simplify_binary_operation_1 (enum rtx_code code, machine_mode mode,
 		return subop1;
 		}
 	}
+
+	  /* Simplify vec_select of a subreg of X to just a vec_select of X
+	 when X has same component mode as vec_select.  */
+	  int l2;
+	  unsigned HOST_WIDE_INT subreg_offset = 0;
+	  if (GET_CODE (trueop0) == SUBREG
+	  && GET_MODE_INNER (mode)
+		 == GET_MODE_INNER (GET_MODE (SUBREG_REG (trueop0)))
+	  && (GET_MODE_NUNITS (GET_MODE (trueop0))).is_constant ()
+	  && (GET_MODE_NUNITS (mode)).is_constant ()
+	  && (GET_MODE_NUNITS (GET_MODE (SUBREG_REG (trueop0
+		  .is_constant ()
+	  && known_le (l1, l2)
+	  && constant_multiple_p (SUBREG_BYTE (trueop0),
+  GET_MODE_UNIT_BITSIZE (mode),
+  _offset))
+	{
+
+	  gcc_assert (known_eq (XVECLEN (trueop1, 0), l1));
+	  bool success = true;
+	  for (int i = 0; i != l1; i++)
+		{
+		  rtx idx  = XVECEXP (trueop1, 0, i);
+		  if (!CONST_INT_P (idx))
+		{
+		  success = false;
+		  break;
+		}
+		}
+	  if (success)
+		{
+		  rtx par = trueop1;
+		  if (subreg_offset)
+		{
+		  rtvec vec = rtvec_alloc (l1);
+		  for (int i = 0; i < l1; i++)
+			RTVEC_ELT (vec, i)
+			  = GEN_INT (INTVAL (XVECEXP (trueop1, 0, i)
+	 + subreg_offset));
+		  par = gen_rtx_PARALLEL (VOIDmode, vec);
+		}
+		  return gen_rtx_VEC_SELECT (mode, SUBREG_REG (trueop0), par);
+		}
+	}
 	}
 
   if (XVECLEN (trueop1, 0) == 1
diff --git a/gcc/testsuite/gcc.target/i386/pr97249-1.c b/gcc/testsuite/gcc.target/i386/pr97249-1.c
new file mode 100644
index 000..4478a34a9f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr97249-1.c
@@ -0,0 +1,30 @@
+/* PR target/97249  */
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3 -masm=att" } */
+/* { dg-final { scan-assembler-times {(?n)vpmovzxbw[ \t]+\(.*%xmm[0-9]} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)vpmovzxwd[ \t]+\(.*%xmm[0-9]} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)vpmovzxdq[ \t]+\(.*%xmm[0-9]} 2 } } */
+
+void
+foo (unsigned char* p1, unsigned char* p2, short* __restrict p3)
+{
+for (int i = 0 ; i != 8; i++)
+ p3[i] = 

[Ada,FYI] revamp ada.numerics.aux

2020-10-18 Thread Alexandre Oliva


Instead of mapping elementary functions for all types to a single
type, use the intrinsics available for the various base types.

A new Ada.Numerics.Aux_Generic_Float is introduced to explicitly
dispatch, based on the 'Digits attribute of the base type, to the
various newly-added Aux_Short_Float, Aux_Float, Aux_Long_Float, or
Aux_Long_Long_Float.

The Aux_Short_Float unit is implemented in terms of the Aux_Float one,
and the others rely on the elementary functions from the C Math
library for float, double and long double types, respectively.

An Aux_Linker_Options is added, and units that import intrinsics from
libm/libc depend on it to provide the "-lm" linker option if needed.
The option is provided by default, but there is an alternate version
that doesn't, that is used for vxworks targets.

The Aux variant that used to open-code Sin and Cos for the ancient
ppc-darwin, because of insufficient precision in libc, is dropped,
along with the alternate dummy body for Aux.  Both are presumed no
longer needed.

The original Ada.Numerics.Aux is retained, for backward compatibility,
as a wrapper for a newly-added Aux_Compat, that renames
Aux_Long_Float, except on x86, in which an alternate version renames
Aux_Long_Long_Float.

Generic_Elementary_Functions and Generic_Complex_Types are adjusted to
use Aux_Generic_Float, avoiding the type conversions and inefficiencies of
computing results in higher precision than requested.

Generic_Complex_Elementary_Functions is adjusted to enable an
additional instance of the sincos optimization, even without -gnatn.

Regstrapped on x86_64-linux-gnu, also tested with various other
targets.  Approved by Arno.  Installing.


for  gcc/ada/ChangeLog

* Makefile.rtl (GNATRTL_NONTASKING_OBJS): Compile Ada.Numerics
child units Aux_Generic_Float, Aux_Long_Long_Float, Aux_Long_Float,
Aux_Float, Aux_Short_Float, Aux_Compat, and Aux_Linker_Options.
(X86_TARGET_PAIRS): Drop dummy body for Aux.  Use x86 version
of Aux_Compat.
(X86_64_TARGET_PAIRS): Likewise.
(LIBGNAT_TARGET_PAIRS): On VxWorks, select the nolibm
variants.  Drop the darwin version of Aux.  Drop the redundant
libc-x86 numaux variants on x86* kfreebsd variants.
* libgnat/a-nagefl.ads: New Aux_Generic_Float.
* libgnat/a-naliop.ads: New Aux_Linker_Options.
* libgnat/a-naliop__nolibm.ads: New.
* libgnat/a-nallfl.ads: New Aux_Long_Long_Float.
* libgnat/a-nalofl.ads: New Aux_Long_Float.
* libgnat/a-nuaufl.ads: New Aux_Float.
* libgnat/a-nashfl.ads: New Aux_Short_Float.
* libgnat/a-ngcefu.ads (Exp): Factor out the Im (X) passed to
Sin and Cos in the Complex variant too.
* libgnat/a-ngcoty.adb: Switch to Aux_Generic_Float.  Drop
redundant conversions.
* libgnat/a-ngelfu.adb: Likewise.
* libgnat/a-nuauco.ads: New Aux_Compat.
* libgnat/a-nuauco__x86.ads: New.
* libgnat/a-numaux.ads: Replace with Compat wrapper.
* libgnat/a-numaux__darwin.adb: Remove.
* libgnat/a-numaux__darwin.ads: Remove.
* libgnat/a-numaux__dummy.adb: Remove.
* libgnat/a-numaux__libc-x86.ads: Remove.
* libgnat/a-numaux__vxworks.ads: Remove.
---
 gcc/ada/Makefile.rtl   |   27 ++--
 gcc/ada/libgnat/a-nagefl.ads   |  171 ++
 gcc/ada/libgnat/a-naliop.ads   |   45 +++
 gcc/ada/libgnat/a-naliop__nolibm.ads   |   43 +++
 gcc/ada/libgnat/a-nallfl.ads   |   87 +
 gcc/ada/libgnat/a-nalofl.ads   |   87 +
 gcc/ada/libgnat/a-nashfl.ads   |   87 +
 gcc/ada/libgnat/a-ngcefu.adb   |5 -
 gcc/ada/libgnat/a-ngcoty.adb   |   34 +++--
 gcc/ada/libgnat/a-ngelfu.adb   |   39 +++---
 gcc/ada/libgnat/a-nuauco.ads   |   40 ++
 gcc/ada/libgnat/a-nuauco__x86.ads  |   39 ++
 gcc/ada/libgnat/a-nuaufl.ads   |   87 +
 gcc/ada/libgnat/a-numaux.ads   |   94 +-
 gcc/ada/libgnat/a-numaux__darwin.adb   |  211 
 gcc/ada/libgnat/a-numaux__darwin.ads   |  103 
 gcc/ada/libgnat/a-numaux__dummy.adb|   32 -
 gcc/ada/libgnat/a-numaux__libc-x86.ads |   97 ---
 gcc/ada/libgnat/a-numaux__vxworks.ads  |   97 ---
 19 files changed, 773 insertions(+), 652 deletions(-)
 create mode 100644 gcc/ada/libgnat/a-nagefl.ads
 create mode 100644 gcc/ada/libgnat/a-naliop.ads
 create mode 100644 gcc/ada/libgnat/a-naliop__nolibm.ads
 create mode 100644 gcc/ada/libgnat/a-nallfl.ads
 create mode 100644 gcc/ada/libgnat/a-nalofl.ads
 create mode 100644 gcc/ada/libgnat/a-nashfl.ads
 create mode 100644 gcc/ada/libgnat/a-nuauco.ads
 create mode 100644 gcc/ada/libgnat/a-nuauco__x86.ads
 create mode 100644 gcc/ada/libgnat/a-nuaufl.ads
 delete mode 100644 gcc/ada/libgnat/a-numaux__darwin.adb
 delete mode 100644 

[PATCH] coroutines: Emit error for invalid promise return types [PR97438].

2020-10-18 Thread Iain Sandoe
Hi,

At one stage, use cases were proposed for allowing the promise
type to contain both return_value and return_void.  That was
not accepted into C++20; so we should reject it as per the PR.

Tested on x86_64-darwin, x86_64-linux-gnu,
OK for master?

(Although this is technically an ‘accepts invalid’, I’m not sure it’s serious
 enough to backport to 10.x - but I’m happy to do so if anyone thinks
 it should be.)

thanks
Iain

gcc/cp/ChangeLog:

PR c++/97438
* coroutines.cc (struct coroutine_info): Add a field to
record that we emitted a promise type error.
(coro_promise_type_found_p): Check for the case that the
promise type contains both return_void and return_value.
Emit an error if so, with information about the wrong
type methods.

gcc/testsuite/ChangeLog:

PR c++/97438
* g++.dg/coroutines/pr97438.C: New test.
---
 gcc/cp/coroutines.cc  | 25 +++
 gcc/testsuite/g++.dg/coroutines/pr97438.C | 30 +++
 2 files changed, 55 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/coroutines/pr97438.C

diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc
index ba813454a0b..9b9141e51fd 100644
--- a/gcc/cp/coroutines.cc
+++ b/gcc/cp/coroutines.cc
@@ -94,6 +94,7 @@ struct GTY((for_user)) coroutine_info
   /* Flags to avoid repeated errors for per-function issues.  */
   bool coro_ret_type_error_emitted;
   bool coro_promise_error_emitted;
+  bool coro_co_return_error_emitted;
 };
 
 struct coroutine_info_hasher : ggc_ptr_hash
@@ -470,6 +471,30 @@ coro_promise_type_found_p (tree fndecl, location_t loc)
  return false;
}
 
+  /* Test for errors in the promise type that can be determined now.  */
+  tree has_ret_void = lookup_member (coro_info->promise_type,
+coro_return_void_identifier,
+/*protect=*/1, /*want_type=*/0,
+tf_none);
+  tree has_ret_val = lookup_member (coro_info->promise_type,
+   coro_return_value_identifier,
+   /*protect=*/1, /*want_type=*/0,
+   tf_none);
+  if (has_ret_void && has_ret_val)
+   {
+ location_t ploc = DECL_SOURCE_LOCATION (fndecl);
+ if (!coro_info->coro_co_return_error_emitted)
+   error_at (ploc, "the coroutine promise type %qT declares both"
+ " % and %",
+ coro_info->promise_type);
+ inform (DECL_SOURCE_LOCATION (BASELINK_FUNCTIONS (has_ret_void)),
+ "% declared here");
+ inform (DECL_SOURCE_LOCATION (BASELINK_FUNCTIONS (has_ret_val)),
+ "% declared here");
+ coro_info->coro_co_return_error_emitted = true;
+ return false;
+   }
+
   /* Try to find the handle type for the promise.  */
   tree handle_type =
instantiate_coro_handle_for_promise_type (loc, coro_info->promise_type);
diff --git a/gcc/testsuite/g++.dg/coroutines/pr97438.C 
b/gcc/testsuite/g++.dg/coroutines/pr97438.C
new file mode 100644
index 000..95376648ed7
--- /dev/null
+++ b/gcc/testsuite/g++.dg/coroutines/pr97438.C
@@ -0,0 +1,30 @@
+#if __has_include()
+#include 
+#else
+#include 
+namespace std { using namespace experimental; }
+#endif
+
+struct dummy_coroutine {};
+
+namespace std {
+
+template<>
+class coroutine_traits<::dummy_coroutine> {
+public:
+struct promise_type {
+void return_value(int x) {  }
+void return_void() {}
+std::suspend_never initial_suspend() noexcept { return {}; }
+std::suspend_never final_suspend() noexcept { return {}; }
+dummy_coroutine get_return_object() { return {}; }
+void unhandled_exception() {}
+};
+};
+
+}
+
+dummy_coroutine
+foo() { // { dg-error {the coroutine promise type 
'std::__n4861::coroutine_traits::promise_type' declares both 
'return_value' and 'return_void'} }
+co_return 17;
+}
-- 
2.24.1



[pushed] libsanitizer, Darwin, Bootstrap : Fix bootstrap on Darwin <= 15.

2020-10-18 Thread Iain Sandoe
Hi

The latest upstream merge for libsanitizer introduces code that makes
use of some macro values that are not available in SDKs for versions
of Darwin <= 15 (macOS 10.11).

[TBH, I am a bit surprised by this, I was under the impression that upstream
 supported versions back to Darwin11 / macOS 10.7]

Add definitions for these where they are not present.

tested on Darwin9 -> Darwin19 and x86_64-linux
pushed to master
thanks
Iain

P.S. I will also add this change to LOCAL_PATCHES.

libsanitizer/ChangeLog:

* sanitizer_common/sanitizer_mac.h: Ensure that TARGET_OS_
macros are defined where the macOS SDK does not contain
them.
(TARGET_OS_OSX, TARGET_OS_IOS, TARGET_OS_TV, TARGET_OS_WATCH):
Define where needed.
---
 libsanitizer/sanitizer_common/sanitizer_mac.h | 20 +++
 1 file changed, 20 insertions(+)

diff --git a/libsanitizer/sanitizer_common/sanitizer_mac.h 
b/libsanitizer/sanitizer_common/sanitizer_mac.h
index 023071e4f11..a2c42b3bf4f 100644
--- a/libsanitizer/sanitizer_common/sanitizer_mac.h
+++ b/libsanitizer/sanitizer_common/sanitizer_mac.h
@@ -14,6 +14,26 @@
 
 #include "sanitizer_common.h"
 #include "sanitizer_platform.h"
+
+/* TARGET_OS_OSX is not present in SDKs before Darwin16 (macOS 10.12) use
+   TARGET_OS_MAC (we have no support for iOS in any form for these versions,
+   so there's no ambiguity).  */
+#if !defined(TARGET_OS_OSX) && TARGET_OS_MAC
+# define TARGET_OS_OSX 1
+#endif
+
+/* Other TARGET_OS_xxx are not present on earlier versions, define them to
+   0 (we have no support for them; they are not valid targets anyway).  */
+#ifndef TARGET_OS_IOS
+#define TARGET_OS_IOS 0
+#endif
+#ifndef TARGET_OS_TV
+#define TARGET_OS_TV 0
+#endif
+#ifndef TARGET_OS_WATCH
+#define TARGET_OS_WATCH 0
+#endif
+
 #if SANITIZER_MAC
 #include "sanitizer_posix.h"
 
-- 
2.24.1



[PATCH] IBM Z: Emit vector alignment hints for strlen

2020-10-18 Thread Stefan Schulze Frielinghaus via Gcc-patches
In case the vectorized version of strlen is used, then each memory
access inside the loop is 16-byte aligned.  Thus add this kind of
information so that vector alignment hints can later on be emitted.

Bootstrapped and regtested on IBM Z.  Ok for master?

gcc/ChangeLog:

* config/s390/s390.c (s390_expand_vec_strlen): Add alignment
for memory access inside loop.
---
 gcc/config/s390/s390.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index dbb541bbea7..f9b27f96fd7 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -5955,6 +5955,7 @@ s390_expand_vec_strlen (rtx target, rtx string, rtx 
alignment)
   rtx temp;
   rtx len = gen_reg_rtx (QImode);
   rtx cond;
+  rtx mem;
 
   s390_load_address (str_addr_base_reg, XEXP (string, 0));
   emit_move_insn (str_idx_reg, const0_rtx);
@@ -5996,10 +5997,10 @@ s390_expand_vec_strlen (rtx target, rtx string, rtx 
alignment)
   LABEL_NUSES (loop_start_label) = 1;
 
   /* Load 16 bytes of the string into VR.  */
-  emit_move_insn (str_reg,
- gen_rtx_MEM (V16QImode,
-  gen_rtx_PLUS (Pmode, str_idx_reg,
-str_addr_base_reg)));
+  mem = gen_rtx_MEM (V16QImode,
+gen_rtx_PLUS (Pmode, str_idx_reg, str_addr_base_reg));
+  set_mem_align (mem, 128);
+  emit_move_insn (str_reg, mem);
   if (into_loop_label != NULL_RTX)
 {
   emit_label (into_loop_label);
-- 
2.25.3



Re: PING^3 [PATCH] x86: Add cmpmemsi for -minline-all-stringops

2020-10-18 Thread H.J. Lu via Gcc-patches
On Sun, Oct 18, 2020 at 8:16 AM Jan Hubicka  wrote:
>
> > On Fri, Oct 2, 2020 at 6:21 AM H.J. Lu  wrote:
> > >
> > > On Wed, Sep 16, 2020 at 10:07 PM H.J. Lu  wrote:
> > > >
> > > > On Wed, Aug 19, 2020 at 6:09 AM H.J. Lu  wrote:
> > > > >
> > > > > On Tue, May 19, 2020 at 5:14 AM H.J. Lu  wrote:
> > > > > >
> > > > > > On Tue, May 19, 2020 at 1:48 AM Uros Bizjak  
> > > > > > wrote:
> > > > > > >
> > > > > > > On Sun, May 17, 2020 at 7:06 PM H.J. Lu  
> > > > > > > wrote:
> > > > > > > >
> > > > > > > > Duplicate the cmpstrn pattern for cmpmem.  The only difference 
> > > > > > > > is that
> > > > > > > > the length argument of cmpmem is guaranteed to be less than or 
> > > > > > > > equal to
> > > > > > > > lengths of 2 memory areas.  Since "repz cmpsb" can be much 
> > > > > > > > slower than
> > > > > > > > memcmp function implemented with vector instruction, see
> > > > > > > >
> > > > > > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
> > > > > > > >
> > > > > > > > expand cmpmem to "repz cmpsb" only with -mgeneral-regs-only.
> > > > > > >
> > > > > > > If there is no benefit compared to the library implementation, 
> > > > > > > then
> > > > > > > enable these patterns only when -minline-all-stringops is used.
> > > > > >
> > > > > > Fixed.
> > > > > >
> > > > > > > Eventually these should be reimplemented with SSE4 string 
> > > > > > > instructions.
> > > > > > >
> > > > > > > Honza is the author of the block handling x86 system, I'll leave 
> > > > > > > the
> > > > > > > review to him.
> > > > > >
> > > > > > We used to expand memcmp to "repz cmpsb" via cmpstrnsi.  It was 
> > > > > > changed
> > > > > > by
> > > > > >
> > > > > > commit 9b0f6f5e511ca512e4faeabc81d2fd3abad9b02f
> > > > > > Author: Nick Clifton 
> > > > > > Date:   Fri Aug 12 16:26:11 2011 +
> > > > > >
> > > > > > builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi 
> > > > > > pattern.
> > > > > >
> > > > > > * builtins.c (expand_builtin_memcmp): Do not use 
> > > > > > cmpstrnsi
> > > > > > pattern.
> > > > > > * doc/md.texi (cmpstrn): Note that the comparison stops 
> > > > > > if both
> > > > > > fetched bytes are zero.
> > > > > > (cmpstr): Likewise.
> > > > > > (cmpmem): Note that the comparison does not stop if 
> > > > > > both of the
> > > > > > fetched bytes are zero.
> > > > > >
> > > > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95151
> > > > > >
> > > > > > is a regression.
> > > > > >
> > > > > > Honza, can you take a look at this?
> > > > > >
> > > > >
> > > > > PING:
> > > > >
> > > > > https://gcc.gnu.org/pipermail/gcc-patches/2020-May/546921.html
> > > > >
> > > >
> > > > PING.
> > > >
> > >
> > > PING.
> > >
> >
> > I'd like to check it in next Tuesday if there are no comments.
>
> I still plan to introduce the two-level optimize_for_size predicates.
> Will try to do that by Tuesday.
>

Thanks.

BTW, this patch is about inlining memcmp with -minline-all-stringops.
It is very important for user interrupt codes (UINTR) not to call memcmp
since memcmp in glibc uses vector registers which shouldn't be used in
user interrupt codes.

-- 
H.J.


Re: PING^3 [PATCH] x86: Add cmpmemsi for -minline-all-stringops

2020-10-18 Thread Jan Hubicka
> On Fri, Oct 2, 2020 at 6:21 AM H.J. Lu  wrote:
> >
> > On Wed, Sep 16, 2020 at 10:07 PM H.J. Lu  wrote:
> > >
> > > On Wed, Aug 19, 2020 at 6:09 AM H.J. Lu  wrote:
> > > >
> > > > On Tue, May 19, 2020 at 5:14 AM H.J. Lu  wrote:
> > > > >
> > > > > On Tue, May 19, 2020 at 1:48 AM Uros Bizjak  wrote:
> > > > > >
> > > > > > On Sun, May 17, 2020 at 7:06 PM H.J. Lu  wrote:
> > > > > > >
> > > > > > > Duplicate the cmpstrn pattern for cmpmem.  The only difference is 
> > > > > > > that
> > > > > > > the length argument of cmpmem is guaranteed to be less than or 
> > > > > > > equal to
> > > > > > > lengths of 2 memory areas.  Since "repz cmpsb" can be much slower 
> > > > > > > than
> > > > > > > memcmp function implemented with vector instruction, see
> > > > > > >
> > > > > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
> > > > > > >
> > > > > > > expand cmpmem to "repz cmpsb" only with -mgeneral-regs-only.
> > > > > >
> > > > > > If there is no benefit compared to the library implementation, then
> > > > > > enable these patterns only when -minline-all-stringops is used.
> > > > >
> > > > > Fixed.
> > > > >
> > > > > > Eventually these should be reimplemented with SSE4 string 
> > > > > > instructions.
> > > > > >
> > > > > > Honza is the author of the block handling x86 system, I'll leave the
> > > > > > review to him.
> > > > >
> > > > > We used to expand memcmp to "repz cmpsb" via cmpstrnsi.  It was 
> > > > > changed
> > > > > by
> > > > >
> > > > > commit 9b0f6f5e511ca512e4faeabc81d2fd3abad9b02f
> > > > > Author: Nick Clifton 
> > > > > Date:   Fri Aug 12 16:26:11 2011 +
> > > > >
> > > > > builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi pattern.
> > > > >
> > > > > * builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi
> > > > > pattern.
> > > > > * doc/md.texi (cmpstrn): Note that the comparison stops 
> > > > > if both
> > > > > fetched bytes are zero.
> > > > > (cmpstr): Likewise.
> > > > > (cmpmem): Note that the comparison does not stop if both 
> > > > > of the
> > > > > fetched bytes are zero.
> > > > >
> > > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95151
> > > > >
> > > > > is a regression.
> > > > >
> > > > > Honza, can you take a look at this?
> > > > >
> > > >
> > > > PING:
> > > >
> > > > https://gcc.gnu.org/pipermail/gcc-patches/2020-May/546921.html
> > > >
> > >
> > > PING.
> > >
> >
> > PING.
> >
> 
> I'd like to check it in next Tuesday if there are no comments.

I still plan to introduce the two-level optimize_for_size predicates.
Will try to do that by Tuesday.

Honza
> 
> -- 
> H.J.


Re: *PING* [PATCH] PR libfortran/97063 - Wrong result for vector (step size is negative) * matrix

2020-10-18 Thread Thomas Koenig via Gcc-patches

Hello Harald,


Early *ping*.



OK for master?  And backport to all open branches where it applies?


OK for both.

Thanks a lot for the patch!

Best regards

Thomas


[PATCH] arm: Fix base register when loading canary address with -msingle-pic-base [PR96828]

2020-10-18 Thread ilya Oleinik via Gcc-patches
After PR85434, code generation with -fstack-protector and
-msingle-pic-base produces a canary value load with the wrong PIC base
register (e.g. r3 instead of r9).  On Thumb-1 targets, the mov insn
needed to allow the base register to be used was missing.

The 'compute_now' check in function require_pic_register doesn't make
sense, because it prevents mov insn generation at PIC register reload,
so it was removed.
In the stack_protect_combined set/test insns, checks are added to select
the correct PIC base register for the current PIC mode.

Tested on arm-none-eabi for thumb1, thumb2, arm.
OK for trunk?

2020-10-18  Oleinik Ilya  

* config/arm/arm.c (arm_option_override): Forbid switching base
register for FDPIC.
(require_pic_register): Restore mov insn generation for Thumb1.
* config/arm/arm.md (*stack_protect_combined_set_insn): Use
correct base register for PIC if -msingle-pic-base is set.
(*stack_protect_combined_test_insn): Likewise.

---
 gcc/config/arm/arm.c  |  5 ++---
 gcc/config/arm/arm.md | 24 ++--
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 8105b39e7..23139f207 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -3560,7 +3560,7 @@ arm_option_override (void)
|| pic_register == HARD_FRAME_POINTER_REGNUM
|| pic_register == STACK_POINTER_REGNUM
|| pic_register >= PC_REGNUM
-   || (TARGET_VXWORKS_RTP
+   || ((TARGET_VXWORKS_RTP || TARGET_FDPIC)
&& (unsigned int) pic_register != arm_pic_register))
  error ("unable to use %qs for PIC register", arm_pic_register_string);
   else
@@ -7858,8 +7858,7 @@ require_pic_register (rtx pic_reg, bool compute_now)
   start_sequence ();

   if (TARGET_THUMB1 && arm_pic_register != INVALID_REGNUM
-  && arm_pic_register > LAST_LO_REGNUM
-  && !compute_now)
+  && arm_pic_register > LAST_LO_REGNUM)
  emit_move_insn (cfun->machine->pic_reg,
  gen_rtx_REG (Pmode, arm_pic_register));
   else
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 147c4a50c..c7934275d 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -9199,10 +9199,16 @@
 {
   rtx pic_reg;

-  if (TARGET_FDPIC)
-  pic_reg = gen_rtx_REG (Pmode, FDPIC_REGNUM);
+  if (TARGET_FDPIC || TARGET_SINGLE_PIC_BASE)
+ {
+  if (!TARGET_THUMB1)
+pic_reg = gen_rtx_REG (Pmode, arm_pic_register);
+  else
+pic_reg = cfun->machine->pic_reg
+  ? cfun->machine->pic_reg : operands[3];
+ }
   else
-  pic_reg = operands[3];
+ pic_reg = operands[3];

   /* Forces recomputing of GOT base now.  */
   legitimize_pic_address (operands[1], SImode, operands[2], pic_reg,
@@ -9282,10 +9288,16 @@
 {
   rtx pic_reg;

-  if (TARGET_FDPIC)
-  pic_reg = gen_rtx_REG (Pmode, FDPIC_REGNUM);
+  if (TARGET_FDPIC || TARGET_SINGLE_PIC_BASE)
+ {
+  if (!TARGET_THUMB1)
+pic_reg = gen_rtx_REG (Pmode, arm_pic_register);
+  else
+pic_reg = cfun->machine->pic_reg
+  ? cfun->machine->pic_reg : operands[4];
+ }
   else
-  pic_reg = operands[4];
+ pic_reg = operands[4];

   /* Forces recomputing of GOT base now.  */
   legitimize_pic_address (operands[1], SImode, operands[3], pic_reg,