c-c++-common/builtin-has-attribute-4.c: Require visibility.

2020-07-22 Thread Hans-Peter Nilsson
Another missed attribute-visibility-requirement, causing a failure for
e.g. mmix-knuth-mmixware.  Committed as obvious.

gcc/testsuite:
* c-c++-common/builtin-has-attribute-4.c: Require visibility.

--- gcc/gcc/testsuite/c-c++-common/builtin-has-attribute-4.c.orig   Mon Jan 
13 22:30:46 2020
+++ gcc/gcc/testsuite/c-c++-common/builtin-has-attribute-4.c	Thu Jul 23 
04:07:45 2020
@@ -4,6 +4,7 @@
{ dg-options "-Wall -ftrack-macro-expansion=0" }
{ dg-options "-Wall -Wno-narrowing -Wno-unused -ftrack-macro-expansion=0" { 
target c++ } }
{ dg-additional-options "-DSKIP_ALIAS" { target *-*-darwin* } }
+   { dg-require-visibility "hidden" }
 */

 #define ATTR(...) __attribute__ ((__VA_ARGS__))


[PATCH] fix off-by-one mistake in -Warray-bounds for major bounds (PR 84079)

2020-07-22 Thread Martin Sebor via Gcc-patches

-Warray-bounds fails to trigger when taking the address of an element
of a multi-dimensional array at an index that's equal to the bound of
one of the higher dimensions of the array.  The attached simple patch
corrects this shortcoming.  I will commit it tomorrow unless there are
suggestions for changes.

Tested on x86_64-linux.

Martin
PR tree-optimization/84079 - missing -Warray-bounds taking the address of past-the-end element of a multidimensional array

gcc/ChangeLog:

	PR tree-optimization/84079
	* gimple-array-bounds.cc (array_bounds_checker::check_addr_expr):
	Pass ignore_off_by_one set to true only for the most significant
	array subscript.

gcc/testsuite/ChangeLog:

	PR tree-optimization/84079
	* gcc.dg/Warray-bounds-62.c: New test.


diff --git a/gcc/gimple-array-bounds.cc b/gcc/gimple-array-bounds.cc
index 352d0745178..613bd612139 100644
--- a/gcc/gimple-array-bounds.cc
+++ b/gcc/gimple-array-bounds.cc
@@ -519,14 +519,21 @@ array_bounds_checker::check_mem_ref (location_t location, tree ref,
 void
 array_bounds_checker::check_addr_expr (location_t location, tree t)
 {
+  /* For the rightmost subscript only, accept taking the address of
+ the just-past-the-end element.  */
+  bool ignore_off_by_one = true;
+
   /* Check each ARRAY_REF and MEM_REF in the reference chain. */
   do
 {
   bool warned = false;
   if (TREE_CODE (t) == ARRAY_REF)
-	warned = check_array_ref (location, t, true /*ignore_off_by_one*/);
+	{
+	  warned = check_array_ref (location, t, ignore_off_by_one);
+	  ignore_off_by_one = false;
+	}
   else if (TREE_CODE (t) == MEM_REF)
-	warned = check_mem_ref (location, t, true /*ignore_off_by_one*/);
+	warned = check_mem_ref (location, t, ignore_off_by_one);
 
   if (warned)
 	TREE_NO_WARNING (t) = true;
diff --git a/gcc/testsuite/gcc.dg/Warray-bounds-62.c b/gcc/testsuite/gcc.dg/Warray-bounds-62.c
new file mode 100644
index 000..c2421aac1b2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/Warray-bounds-62.c
@@ -0,0 +1,130 @@
+/* PR tree-optimization/84079 - missing -Warray-bounds taking the address
+   of past-the-end element of a multidimensional array
+   { dg-do compile }
+   { dg-options "-O2 -Wall -ftrack-macro-expansion=0" } */
+
+void sink (int, ...);
+
+#define T(type, dims, inxs)			\
+  do {		\
+type a dims;\
+sink (__LINE__,  inxs);			\
+  } while (0)
+
+
+void test_char_1_1 (int i0, int i1, int i2)
+{
+#undef DIMS
+#define DIMS [1][1]
+
+  T (char, DIMS, [0]);
+  T (char, DIMS, [1]);
+  T (char, DIMS, [2]);// { dg-warning "subscript 2 is above array bounds of 'char\\\[1]\\\[1]'" }
+
+  T (char, DIMS, [0][0]);
+  T (char, DIMS, [0][1]);
+  T (char, DIMS, [0][2]); // { dg-warning "subscript 2 is above array bounds of 'char\\\[1]'" }
+
+  T (char, DIMS, [1][0]); // { dg-warning "subscript 1 is above array bounds of 'char\\\[1]\\\[1]'" }
+  T (char, DIMS, [1][1]); // { dg-warning "subscript 1 is above array bounds of 'char\\\[1]\\\[1]'" }
+  T (char, DIMS, [1][2]); // { dg-warning "subscript 2 is above array bounds of 'char\\\[1]'" }
+
+  // Exercise ranges.
+  if (i0 < 0) i0 = 0;
+  if (i1 < 1) i1 = 1;
+  if (i2 < 2) i2 = 2;
+
+  T (char, DIMS, [i0]);
+  T (char, DIMS, [i1]);
+  T (char, DIMS, [i2]);   // { dg-warning "subscript 2 is above array bounds of 'char\\\[1]\\\[1]" }
+
+  T (char, DIMS, [i0][i0]);
+  T (char, DIMS, [i0][i1]);
+  T (char, DIMS, [i1][i0]);   // { dg-warning "subscript 1 is above array bounds of 'char\\\[1]\\\[1]'" }
+  T (char, DIMS, [i1][i1]);   // { dg-warning "subscript 1 is above array bounds of 'char\\\[1]\\\[1]'" }
+  T (char, DIMS, [i1][i2]);   // { dg-warning "subscript 2 is above array bounds of 'char\\\[1]'" }
+}
+
+
+void test_int_3_5 (int i0, int i1, int i2, int i3, int i4, int i5, int i6)
+{
+#undef DIMS
+#define DIMS [3][5]
+
+  T (int, DIMS, [0]);
+  T (int, DIMS, [3]);
+  T (int, DIMS, [4]); // { dg-warning "subscript 4 is above array bounds of 'int\\\[3]\\\[5]'" }
+
+  T (int, DIMS, [0][0]);
+  T (int, DIMS, [0][5]);
+  T (int, DIMS, [0][6]);  // { dg-warning "subscript 6 is above array bounds of 'int\\\[5]'" }
+
+  T (int, DIMS, [1][0]);
+  T (int, DIMS, [1][5]);
+  T (int, DIMS, [1][6]);  // { dg-warning "subscript 6 is above array bounds of 'int\\\[5]'" }
+
+  T (int, DIMS, [3][0]);  // { dg-warning "subscript 3 is above array bounds of 'int\\\[3]\\\[5]'" }
+  T (int, DIMS, [3][5]);  // { dg-warning "subscript 3 is above array bounds of 'int\\\[3]\\\[5]'" }
+  T (int, DIMS, [3][6]);  // { dg-warning "subscript 6 is above array bounds of 'int\\\[5]'" }
+
+  // Exercise ranges.
+  if (i0 < 0) i0 = 0;
+  if (i1 < 1) i1 = 1;
+  if (i2 < 2) i2 = 2;
+  if (i3 < 3) i3 = 3;
+  if (i4 < 4) i4 = 4;
+  if (i5 < 5) i5 = 5;
+  if (i6 < 6) i6 = 6;
+
+  T (int, DIMS, [i0]);
+  T (int, DIMS, [i3]);
+  T (int, DIMS, [i4]);// { dg-warning "subscript 4 is above array bounds of 'int\\\[3]\\\[5]" }
+
+  T (int, DIMS, [i0][i0]);
+  T (int, DIMS, [i0][i5]);
+  T (int, 

Re: [PATCH PR95696] regrename creates overlapping register allocations for vliw

2020-07-22 Thread Richard Sandiford
Zhongyunde  writes:
>> -Original Message-
>> From: Richard Sandiford [mailto:richard.sandif...@arm.com]
>> Sent: Wednesday, July 22, 2020 12:12 AM
>> To: Zhongyunde 
>> Cc: gcc-patches@gcc.gnu.org; Yangfei (A) 
>> Subject: Re: 答复: [PATCH PR95696] regrename creates overlapping
>> register allocations for vliw
>> 
>> Zhongyunde  writes:
>> > Thanks for your review.
>> >
>> > First of all, this is an optimization.
>> 
>> OK, good.
>> 
>> >gcc does sms before reload, and here each insn uses a pseudo-register.
>> After reload, they are allocated hard-register, then the regrename pass try
>> to adjust the register number with def/use chain created by
>> build_def_use.
>> >  As gcc currently doesn't consider the VLIW bundles, the regrename pass may
>> update a reg which may not really be unused, which will bring in invalid
>> VLIW bundles.
>> >Before the final schedule, we usually recheck the validity of VLIW
>> bundles, and reschedule the conflicting insns into two VLIWs to make them
>> valid and avoid the above issue, so this is not a correctness issue.
>> >  Certainly, rescheduling the conflicting insns into two VLIWs will destroy
>> the kernel loop's sms schedule result, and usually it will be harmful to the
>> performance.
>> 
>> Yeah.  The reason I was worried about the TI markers being stale is that, in
>> general, register allocation can introduce new spills and reloads, can add
>> and remove instructions, and can convert instructions into different forms
>> (e.g. as a result of register elimination).
>> There are then post-reload optimisers that can change the code further.
>> All these things could invalidate the VLIW bundling done by the first
>> scheduler.
>> 
>> It sounds like that's not happening in your motivating testcase, and the
>> VLIW bundling is still correct (for this loop) by the time that regrename
>> runs.  Is that right?
>
> Yes, it is right.
>
>> It's interesting that this is for a testcase using SMS.  One of the 
>> traditional
>> problems with the GCC implementation of SMS has been ensuring that
>> later passes don't mess up the scheduled loop.  So in your testcase, does
>> register allocation succeed for the SMS loop without invalidating the
>> bundling decisions?
>
> Yes.
>
>> If so, then it's probably better to avoid running regrename on it at all.
>> It mostly exists to help the second scheduling pass, but the second
>> scheduling pass shouldn't be messing with an SMS loop anyway.  Also,
>> although the patch deals with one case in which regrename could disrupt
>> the bundling, there are others too.
>> 
>> So maybe one option would be to make regrename ignore blocks that
>> have BB_DISABLE_SCHEDULE set.  (Sorry if that's been discussed and
>> discounted
>> already.)
>
> ok, according to your advice, I made a new patch, attached.

Thanks.  I think we should treat the SMS and the REG_UNUSED stuff as
separate patches though.

For the SMS part, I think a better place to enforce the rule
is in build_def_use.  If that function returns false early for
BB_DISABLE_SCHEDULE, we will avoid disrupting the schedule for the
block without wasting too much compile time on it, and we'll still
keep the pass structures internally correct.  (It would also be good
to have a dump_file message to say that that's what we're doing.)

Do you still need the REG_UNUSED stuff with the SMS patch?  If so,
could you describe the (presumably non-SMS) cases that are affected?

TBH, given that the bundling information is so uncertain at this stage,
I think it would be better to have a mode in which regrename ignores
REG_UNUSED notes altogether.  Perhaps we could put it under a --param,
which targets could then set to whichever default they prefer.
The default should be the current behaviour though.

Thanks,
Richard


Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread Richard Sandiford
Richard Sandiford  writes:
> luoxhu  writes:
>> Hi,
>>
>> On 2020/7/22 19:05, Richard Sandiford wrote:
>>> This wasn't really what I meant.  Using subregs is fine, but I was
>>> thinking of:
>>> 
>>>/* Also try a wider mode if the necessary punning is either not
>>>  desirable or not possible.  */
>>>if (!CONSTANT_P (store_info->rhs)
>>>   && !targetm.modes_tieable_p (new_mode, store_mode))
>>> continue;
>>> 
>>>if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>>> {
>>>   /* Try to implement the shift using a subreg.  */
>>>   poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
>>>   shift);
>>>   rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>>> store_mode, offset);
>>>   if (rhs_subreg)
>>> {
>>>   ...
>>>   break;
>>> }
>>> }
>>> 
>>> where the rhs_subreg is from your original patch.
>>> 
>>> The multiple_p should be that way round: the shift needs to be a
>>> multiple of the new_mode for the subreg to be valid.
>>> 
>>> I think this should also avoid the BITS_PER_WORD problem.  On the
>>> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
>>> we're using subregs, so maybe moving it to after the multiple_p
>>> if block would still make sense.
>>> 
>>
>> Thanks, I took that rhs_subreg part back for the v3 patch and updated a bit
>> based on your prototype, shift should be put in op1 as multiple_p requires
>> op0 >= op1. 
>>
>> Then, new_mode is still TImode same to store_mode, offset will return 8 when
>> shift is 64,  simplify_gen_subreg needs an additional inner_mode(DImode) 
>> generated from "smallest_int_mode_for_size (shift)" to get rhs_subreg, 
>> otherwise it will return NULL if new_mode is equal to store_mode.
>>
>> Lastly, move the BITS_PER_UNIT after multiple_p as it still need generate
>> shift_seq for other circumstances. :)
>
> I don't understand why my version doesn't work though.  The point
> is that we're using the containing:
>
>   FOR_EACH_MODE_FROM (new_mode_iter,
> smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
>
> to find a suitable mode.  In the existing code it's searching for a mode
> that is suitable for the shift.  In the new code it's finding a mode that
> is suitable for the outer mode of the subreg (hence using new_mode as the
> first argument to simplify_gen_subreg above).  It shouldn't be necessary
> to use smallest_int_mode_for_size to find a different mode.

I now realise the reason is that the starting mode is too wide.
I think we should fix that by doing:

  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
{
  …

and then add:

  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
continue;

after your optimisation, so that the shift code proper still only
considers modes that are wide enough to hold the unshifted value.

I don't think there are any efficiency concerns with that, since
smallest_int_mode_for_size does its own similar iteration internally.

Sorry for not picking up on that first time.

Thanks,
Richard


Re: [PATCH] Treat { 0 } specially for structs with the designated_init attribute.

2020-07-22 Thread Asher Gordon via Gcc-patches
Hello Joseph, Martin,

Asher Gordon  writes:

> Joseph Myers  writes:
>
>> I don't see you in the FSF copyright assignment list; could you
>> complete
>> https://git.savannah.gnu.org/cgit/gnulib.git/plain/doc/Copyright/request-assign.future
>> (unless you're already covered by an employer assignment)?
>
> Done.

My copyright assignment finally got finished, so you should be able to
apply my patches now.

For your convenience, I have attached the three patches below:
From e94073b90d5906dc4eb14ebfec4ea24ae1241184 Mon Sep 17 00:00:00 2001
From: Asher Gordon 
Date: Mon, 8 Jun 2020 20:59:38 -0400
Subject: [PATCH 1/3] Replace free with XDELETE.

gcc/c/ChangeLog:

	* c-typeck.c (free_all_tagged_tu_seen_up_to): Replace free
	with XDELETE.
	(finish_init): Likewise.
	(pop_init_level): Likewise.
---
 gcc/c/c-typeck.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index fb5c288b549..44f2722adb8 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -1407,7 +1407,7 @@ free_all_tagged_tu_seen_up_to (const struct tagged_tu_seen_cache *tu_til)
   const struct tagged_tu_seen_cache *const tu1
 	= (const struct tagged_tu_seen_cache *) tu;
   tu = tu1->next;
-  free (CONST_CAST (struct tagged_tu_seen_cache *, tu1));
+  XDELETE (CONST_CAST (struct tagged_tu_seen_cache *, tu1));
 }
   tagged_tu_seen_base = tu_til;
 }
@@ -8314,13 +8314,13 @@ finish_init (void)
 {
   struct constructor_stack *q = constructor_stack;
   constructor_stack = q->next;
-  free (q);
+  XDELETE (q);
 }
 
   gcc_assert (!constructor_range_stack);
 
   /* Pop back to the data of the outer initializer (if any).  */
-  free (spelling_base);
+  XDELETE (spelling_base);
 
   constructor_decl = p->decl;
   require_constant_value = p->require_constant_value;
@@ -8333,7 +8333,7 @@ finish_init (void)
   spelling_size = p->spelling_size;
   constructor_top_level = p->top_level;
   initializer_stack = p->next;
-  free (p);
+  XDELETE (p);
 }

 /* Call here when we see the initializer is surrounded by braces.
@@ -8864,7 +8864,7 @@ pop_init_level (location_t loc, int implicit,
   RESTORE_SPELLING_DEPTH (constructor_depth);
 
   constructor_stack = p->next;
-  free (p);
+  XDELETE (p);
 
   if (ret.value == NULL_TREE && constructor_stack == 0)
 ret.value = error_mark_node;
-- 
2.27.0

From b69c8aec5639655bc87ded65d75041a7e1d7c14f Mon Sep 17 00:00:00 2001
From: Asher Gordon 
Date: Wed, 3 Jun 2020 17:20:08 -0400
Subject: [PATCH 2/3] PR c/95379 Treat { 0 } specially for -Wdesignated-init.

gcc/c/ChangeLog:

	PR c/95379
	* c-typeck.c (warning_init): Allow variable arguments.
	(struct positional_init_info): New type.
	(struct initializer_stack): Add positional_init_info for
	-Wdesignated-init warnings.
	(start_init): Initialize
	initializer_stack->positional_init_info.
	(finish_init): Free initializer_stack->positional_init_info.
	(pop_init_level): Move -Wdesignated-init warning here from
	process_init_element so that we can treat { 0 } specially.
	(process_init_element): Instead of warning on
	-Wdesignated-init here, remember a list of locations and
	fields where we should warn, and do the actual warning in
	pop_init_level.

gcc/ChangeLog:

	PR c/95379
	* doc/extend.texi: Document { 0 } as a special case for the
	designated_init attribute.

gcc/testsuite/ChangeLog:

	PR c/95379
	* gcc.dg/Wdesignated-init-3.c: New test.
---
 gcc/c/c-typeck.c  | 69 ---
 gcc/doc/extend.texi   |  4 ++
 gcc/testsuite/gcc.dg/Wdesignated-init-3.c | 12 
 3 files changed, 77 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/Wdesignated-init-3.c

diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index 44f2722adb8..1afec3fdc36 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -107,7 +107,7 @@ static void push_string (const char *);
 static void push_member_name (tree);
 static int spelling_length (void);
 static char *print_spelling (char *);
-static void warning_init (location_t, int, const char *);
+static void warning_init (location_t, int, const char *, ...);
 static tree digest_init (location_t, tree, tree, tree, bool, bool, int);
 static void output_init_element (location_t, tree, tree, bool, tree, tree, bool,
  bool, struct obstack *);
@@ -6431,11 +6431,12 @@ pedwarn_init (location_t loc, int opt, const char *gmsgid, ...)
controls this warning.  GMSGID identifies the message.  The
component name is taken from the spelling stack.  */
 
-static void
-warning_init (location_t loc, int opt, const char *gmsgid)
+static void ATTRIBUTE_GCC_DIAG (3,4)
+warning_init (location_t loc, int opt, const char *gmsgid, ...)
 {
   char *ofwhat;
   bool warned;
+  va_list ap;
 
   auto_diagnostic_group d;
 
@@ -6445,7 +6446,9 @@ warning_init (location_t loc, int opt, const char *gmsgid)
   location_t exploc = expansion_point_location_if_in_system_header (loc);
 
   /* The gmsgid may be a 

RE: [PATCH] middle-end: Fold popcount(x&4) to (x>>2)&1 and friends.

2020-07-22 Thread Roger Sayle

Hi Richard,

Many thanks for the peer review and feedback.  I completely agree that POPCOUNT
and PARITY iterators simplify things and handle the IFN_ variants.  Likewise, 
using
integer_type_node as the type of shift constants also matches the idioms used 
elsewhere in match.pd and fold.  The following (combined) patch implements those
suggestions, for both submitted patches and the existing POPCOUNT 
simplifications,
cleaning up this logic and making this part of match.pd more consistent.
[I hadn't appreciated using POPCOUNT/PARITY avoids the need for an explicit 
for].

I've kept the shiftrt unsigned which is both a requirement for the 
transformation (when
the single bit is the sign bit), but this also matches the (canonicalization) 
preference in
the middle-end that unsigned logical shifts are preferred over arithmetic 
shifts when
the distinction isn't important [lshrdi3 is sometimes cheaper than ashrdi3].

This revised patch has been tested on x86_64-pc-linux-gnu with a "make 
bootstrap"
and "make -k check" with no new failures.
Ok for mainline?


2020-07-22  Roger Sayle  
Richard Biener  

gcc/ChangeLog
* match.pd (popcount(x)&1 -> parity(x)): New simplification.
(parity(~x) -> parity(x)): New simplification.
(parity(x)^parity(y) -> parity(x^y)): New simplification.
(parity(x&1) -> x&1): New simplification.
(popcount(x) -> x>>C): New simplification.

gcc/testsuite/ChangeLog
* gcc.dg/fold-popcount-5.c: New test.
* gcc.dg/fold-parity-1.c: Likewise.
* gcc.dg/fold-parity-2.c: Likewise.
* gcc.dg/fold-parity-3.c: Likewise.
* gcc.dg/fold-parity-4.c: Likewise.
* gcc.dg/fold-parity-5.c: Likewise.


Thanks in advance,
Roger
--
Roger Sayle
NextMove Software
Cambridge, UK

-Original Message-
From: Richard Biener  
Sent: 20 July 2020 14:40
To: Roger Sayle 
Cc: GCC Patches 
Subject: Re: [PATCH] middle-end: Fold popcount(x&4) to (x>>2)&1 and friends.

On Mon, Jul 20, 2020 at 3:06 PM Roger Sayle  wrote:
> This patch complements one from June 12th which is still awaiting review: 
> https://gcc.gnu.org/pipermail/gcc-patches/2020-June/547937.html
>
> This patch optimizes popcount and parity of an argument known to have 
> at most a single bit set, to be that single bit.  Hence, popcount(x&8)
> is simplified to (x>>3)&1.   This generalizes the existing optimization
> of popcount(x&1) being simplified to x&1, which is moved with this 
> patch to avoid a duplicate pattern warning in match.pd.
>
> This patch has been tested on x86_64-pc-linux-gnu with a "make bootstrap"
> and "make -k check" with no new failures.  If this is approved after 
> (or at the same time) as the patch above, I'm happy to resolve the 
> conflicts and retest before committing.

Given you know the constant bit position of the possibly nonzero bit you can 
elide the conversion to unsigned for all but the case of a possibly negative 
input (IIRC GCC doesn't yet take advantage of negative right shift 
undefinedness - but maybe sanitizers complain).
Also the shift amount doesn't need to be in the same type as the shifted amount 
so using either size_int() or integer_type_node for that argument should reduce 
INTEGER_CST waste.

Any reason you are not tackling IFN_POPCOUNT/PARITY?
You could use

(for pfun (POPCOUNT PARITY)
 ...

and automagically get all builtins and the IFN.

Thanks,
Richard.

diff --git a/gcc/match.pd b/gcc/match.pd
index c6ae7a7..a096a17 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5964,25 +5964,55 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(IFN_FMA @0 @1 @2
 
 /* POPCOUNT simplifications.  */
-(for popcount (BUILT_IN_POPCOUNT BUILT_IN_POPCOUNTL BUILT_IN_POPCOUNTLL
-  BUILT_IN_POPCOUNTIMAX)
-  /* popcount(X&1) is nop_expr(X&1).  */
-  (simplify
-(popcount @0)
-(if (tree_nonzero_bits (@0) == 1)
-  (convert @0)))
-  /* popcount(X) + popcount(Y) is popcount(X|Y) when X must be zero.  */
-  (simplify
-(plus (popcount:s @0) (popcount:s @1))
-(if (wi::bit_and (tree_nonzero_bits (@0), tree_nonzero_bits (@1)) == 0)
-  (popcount (bit_ior @0 @1
-  /* popcount(X) == 0 is X == 0, and related (in)equalities.  */
+/* popcount(X) + popcount(Y) is popcount(X|Y) when X must be zero.  */
+(simplify
+  (plus (POPCOUNT:s @0) (POPCOUNT:s @1))
+  (if (wi::bit_and (tree_nonzero_bits (@0), tree_nonzero_bits (@1)) == 0)
+(POPCOUNT (bit_ior @0 @1
+
+/* popcount(X) == 0 is X == 0, and related (in)equalities.  */
+(for popcount (POPCOUNT)
   (for cmp (le eq ne gt)
rep (eq eq ne ne)
 (simplify
   (cmp (popcount @0) integer_zerop)
   (rep @0 { build_zero_cst (TREE_TYPE (@0)); }
 
+/* Canonicalize POPCOUNT(x)&1 as PARITY(X).  */
+(for popcount (BUILT_IN_POPCOUNT BUILT_IN_POPCOUNTL BUILT_IN_POPCOUNTLL
+  BUILT_IN_POPCOUNTIMAX)
+ parity (BUILT_IN_PARITY BUILT_IN_PARITYL BUILT_IN_PARITYLL
+BUILT_IN_PARITYIMAX)
+  (simplify
+(bit_and (popcount @0) 

Re: [PATCH] rs6000: __builtin_mma_disassemble_acc() doesn't store elements correctly in LE mode

2020-07-22 Thread Peter Bergner via Gcc-patches
On 7/22/20 1:00 PM, Segher Boessenkool wrote:
>> gcc/
>>  PR target/96236
>>  * config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Handle
>>  little-endian memory ordering.
>> 
>> gcc/testsuite/
>>  PR target/96236
>>  * gcc.target/powerpc/mma-double-test.c: Update storing results for
>>  correct little-endian ordering.
>>  * gcc.target/powerpc/mma-single-test.c: Likewise.
> 
> Okay for trunk.  It's not going to benefit from any soak-in time other
> than what you have tested already, so it is fine for 10 immediately as
> well.  Thanks!

Ok, pushed to trunk.  I'll wait for the branch to reopen before pushing it
there too.  Thanks!

Peter




Re: [PATCH 2/2] Aarch64: Add branch diluter pass

2020-07-22 Thread Andrea Corallo
Segher Boessenkool  writes:

> Hi!
>
> On Wed, Jul 22, 2020 at 03:53:34PM +0200, Andrea Corallo wrote:
>> Andrew Pinski  writes:
>> > Can you give a simple example of what this patch does?
>>
>> Sure, this pass simply moves a sliding window over the insns trying to
>> make sure that we never have more then 'max_branch' branches for every
>> 'granule_size' insns.
>>
>> If too many branches are detected, nops are added where considered less
>> harmful to correct that.
>
> Should that actually be a sliding window, or should there actually just
> not be more than N branches per aligned block of machine code?  Like,
> per fetch group.
>
> Can you not use ASM_OUTPUT_ALIGN_WITH_NOP (or ASM_OUTPUT_MAX_SKIP_ALIGN
> even) then?  GCC has infrastructure for that, already.

Correct, it's a sliding window only because the real load address is not
known to the compiler and the algorithm is conservative.  I believe we
could use ASM_OUTPUT_ALIGN_WITH_NOP if we align each function to (at
least) the granule size, then we should be able to insert 'nop aligned
labels' precisely.

My main fear is that, given new cores tend to have big granules, code size
would blow up.  One advantage of the implemented algorithm is that even if
slightly conservative it impacts code size only where a high branch
density shows up.

  Andrea


[committed] libstdc++: Add static assertions to futures and promises [LWG 3458]

2020-07-22 Thread Jonathan Wakely via Gcc-patches
LWG recently decided it should be ill-formed to instantiate std::future
and std::shared_future for types that can't be returned from a function.
This adds static assertions to enforce it (std::future already failed,
but this makes the error more understandable).

LWG 3466 extends that to std::promise. The actual constraint is that
t.~T() is well-formed for the primary template, but rejecting arrays and
functions as done for futures matches that condition.

libstdc++-v3/ChangeLog:

* include/std/future (future, shared_future, promise): Add
static assertions to the primary template to reject array and
function types.
* testsuite/30_threads/future/requirements/lwg3458.cc: New test.
* testsuite/30_threads/promise/requirements/lwg3466.cc: New test.
* testsuite/30_threads/shared_future/requirements/lwg3458.cc: New test.

Tested powerpc64le-linux, committed to trunk.

commit 1f53367fb5f16985e82c39f56368b956292cf86c
Author: Jonathan Wakely 
Date:   Wed Jul 22 20:10:38 2020

libstdc++: Add static assertions to futures and promises [LWG 3458]

LWG recently decided it should be ill-formed to instantiate std::future
and std::shared_future for types that can't be returned from a function.
This adds static assertions to enforce it (std::future already failed,
but this makes the error more understandable).

LWG 3466 extends that to std::promise. The actual constraint is that
t.~T() is well-formed for the primary template, but rejecting arrays and
functions as done for futures matches that condition.

libstdc++-v3/ChangeLog:

* include/std/future (future, shared_future, promise): Add
static assertions to the primary template to reject array and
function types.
* testsuite/30_threads/future/requirements/lwg3458.cc: New test.
* testsuite/30_threads/promise/requirements/lwg3466.cc: New test.
* testsuite/30_threads/shared_future/requirements/lwg3458.cc: New 
test.

diff --git a/libstdc++-v3/include/std/future b/libstdc++-v3/include/std/future
index 6eef6864f4d..bdf4a75d694 100644
--- a/libstdc++-v3/include/std/future
+++ b/libstdc++-v3/include/std/future
@@ -763,6 +763,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template
 class future : public __basic_future<_Res>
 {
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 3458. Is shared_future intended to work with arrays or function types?
+  static_assert(!is_array<_Res>{}, "result type is not an array");
+  static_assert(!is_function<_Res>{}, "result type is not a function");
+
   friend class promise<_Res>;
   template friend class packaged_task;
   template
@@ -893,6 +898,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template
 class shared_future : public __basic_future<_Res>
 {
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 3458. Is shared_future intended to work with arrays or function types?
+  static_assert(!is_array<_Res>{}, "result type is not an array");
+  static_assert(!is_function<_Res>{}, "result type is not a function");
+
   typedef __basic_future<_Res> _Base_type;
 
 public:
@@ -1045,6 +1055,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template
 class promise
 {
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 3466: Specify the requirements for promise/future/[...] consistently
+  static_assert(!is_array<_Res>{}, "result type is not an array");
+  static_assert(!is_function<_Res>{}, "result type is not a function");
+
   typedef __future_base::_State_base   _State;
   typedef __future_base::_Result<_Res> _Res_type;
   typedef __future_base::_Ptr<_Res_type>   _Ptr_type;
diff --git a/libstdc++-v3/testsuite/30_threads/future/requirements/lwg3458.cc 
b/libstdc++-v3/testsuite/30_threads/future/requirements/lwg3458.cc
new file mode 100644
index 000..2bc206c9450
--- /dev/null
+++ b/libstdc++-v3/testsuite/30_threads/future/requirements/lwg3458.cc
@@ -0,0 +1,34 @@
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// .
+
+// { dg-do compile { target c++11 } }
+
+// LWG 3458
+// Is shared_future intended to work with arrays or function types?
+
+#include 
+
+std::future good;
+std::future good2;
+
+std::future 

c++: Name as_base type

2020-07-22 Thread Nathan Sidwell
The as-base type never got a name.  For modules I needed to give it a 
name to serialize properly, and it's useful when debugging the compiler, 
so we may as well have it on trunk.  There's also a bug where its fields 
can have NSDMIs from the main class.  This happens to be silent on 
trunk, but can be a GC leak where we retain a deferred parse node there. 
(On modules it blows up, because we're not prepared to serialize 
deferred parse nodes, as they should never survive parsing.)


gcc/cp/
* cp-tree.h (enum cp_tree_index): Add CPTI_AS_BASE_IDENTIFIER.
(as_base_identifier): Define.
	* decl.c (initialize_predefined_identifiers): Initialize
	as_base identifier.
* class.c (layout_class_type): Name the as-base type.  Zap
NSDMI its fields may have.

--
Nathan Sidwell
diff --git i/gcc/cp/class.c w/gcc/cp/class.c
index a3913f4ce0b..ba96113bc88 100644
--- i/gcc/cp/class.c
+++ w/gcc/cp/class.c
@@ -6715,6 +6715,10 @@ layout_class_type (tree t, tree *virtuals_p)
   /* T needs a different layout as a base (eliding virtual bases
 	 or whatever).  Create that version.  */
   tree base_t = make_node (TREE_CODE (t));
+  tree base_d = create_implicit_typedef (as_base_identifier, base_t);
+
+  TYPE_CONTEXT (base_t) = t;
+  DECL_CONTEXT (base_d) = t;
 
   /* If the ABI version is not at least two, and the last
 	 field was a bit-field, RLI may not be on a byte
@@ -6751,6 +6755,9 @@ layout_class_type (tree t, tree *virtuals_p)
 	if (TREE_CODE (field) == FIELD_DECL)
 	  {
 	*next_field = copy_node (field);
+	/* Zap any NSDMI, it's not needed and might be a deferred
+	   parse.  */
+	DECL_INITIAL (*next_field) = NULL_TREE;
 	DECL_CONTEXT (*next_field) = base_t;
 	next_field = _CHAIN (*next_field);
 	  }
@@ -6760,8 +6767,6 @@ layout_class_type (tree t, tree *virtuals_p)
 	 needs a mode.  */
   compute_record_mode (base_t);
 
-  TYPE_CONTEXT (base_t) = t;
-
   /* Record the base version of the type.  */
   CLASSTYPE_AS_BASE (t) = base_t;
 }
diff --git i/gcc/cp/cp-tree.h w/gcc/cp/cp-tree.h
index d43c53ae29a..2377fc052bb 100644
--- i/gcc/cp/cp-tree.h
+++ w/gcc/cp/cp-tree.h
@@ -148,6 +148,7 @@ enum cp_tree_index
 CPTI_DELTA_IDENTIFIER,
 CPTI_IN_CHARGE_IDENTIFIER,
 CPTI_VTT_PARM_IDENTIFIER,
+CPTI_AS_BASE_IDENTIFIER,
 CPTI_THIS_IDENTIFIER,
 CPTI_PFN_IDENTIFIER,
 CPTI_VPTR_IDENTIFIER,
@@ -289,6 +290,7 @@ extern GTY(()) tree cp_global_trees[CPTI_MAX];
 /* The name of the parameter that contains a pointer to the VTT to use
for this subobject constructor or destructor.  */
 #define vtt_parm_identifier		cp_global_trees[CPTI_VTT_PARM_IDENTIFIER]
+#define as_base_identifier		cp_global_trees[CPTI_AS_BASE_IDENTIFIER]
 #define this_identifier			cp_global_trees[CPTI_THIS_IDENTIFIER]
 #define pfn_identifier			cp_global_trees[CPTI_PFN_IDENTIFIER]
 #define vptr_identifier			cp_global_trees[CPTI_VPTR_IDENTIFIER]
diff --git i/gcc/cp/decl.c w/gcc/cp/decl.c
index de53a7b8b73..385b1f3a0c4 100644
--- i/gcc/cp/decl.c
+++ w/gcc/cp/decl.c
@@ -4202,6 +4202,7 @@ initialize_predefined_identifiers (void)
 {"__dt_del ", _dtor_identifier, cik_dtor},
 {"__conv_op ", _op_identifier, cik_conv_op},
 {"__in_chrg", _charge_identifier, cik_normal},
+{"__as_base ", _base_identifier, cik_normal},
 {"this", _identifier, cik_normal},
 {"__delta", _identifier, cik_normal},
 {"__pfn", _identifier, cik_normal},


Re: [PATCH 1/2] Add new RTX instruction class FILLER_INSN

2020-07-22 Thread Joseph Myers
New insn types should be documented in rtl.texi (I think in the "Insns" 
section).

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH PR96053] Add "#pragma GCC no_reduc_chain"

2020-07-22 Thread Joseph Myers
A new pragma needs to be documented in extend.texi.  Such documentation 
should be comprehensible to users who don't know anything about the 
internals of GCC or other compilers, so that they can understand when it 
would be appropriate to use the pragma in their source code.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [Patch] OpenMP: Support 'if (simd:/cancel:' in Fortran

2020-07-22 Thread Jakub Jelinek via Gcc-patches
On Wed, Jul 22, 2020 at 05:52:00PM +0200, Tobias Burnus wrote:
> gcc/fortran/ChangeLog:
> 
>   * gfortran.h (enum gfc_omp_if_kind): Add OMP_IF_CANCEL and OMP_IF_SIMD.
>   * openmp.c (OMP_SIMD_CLAUSES): Add OMP_CLAUSE_IF.
>   (gfc_match_omp_clauses, resolve_omp_clauses): Handle 'if (simd/cancel:'.
>   * dump-parse-tree.c (show_omp_clauses): Likewise.
>   * trans-openmp.c (gfc_trans_omp_clauses, gfc_trans_omp_cancel,
>   (gfc_split_omp_clauses): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gfortran.dg/gomp/cancel-2.f90: New test.
>   * gfortran.dg/gomp/cancel-3.f90: New test.
>   * gfortran.dg/gomp/if-1.f90: New test.

Ok, thanks.

Jakub



Re: [PATCH] rs6000: __builtin_mma_disassemble_acc() doesn't store elements correctly in LE mode

2020-07-22 Thread Segher Boessenkool
Hi Peter,

On Wed, Jul 22, 2020 at 12:01:21PM -0500, Peter Bergner wrote:
> PR96236 shows a problem where we don't store our 512-bit accumulators
> correctly in little-endian mode.  The patch below detects when we're doing a
> little-endian memory access and stores to the correct memory locations.

> gcc/
>   PR target/96236
>   * config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Handle
>   little-endian memory ordering.
> 
> gcc/testsuite/
>   PR target/96236
>   * gcc.target/powerpc/mma-double-test.c: Update storing results for
>   correct little-endian ordering.
>   * gcc.target/powerpc/mma-single-test.c: Likewise.

Okay for trunk.  It's not going to benefit from any soak-in time other
than what you have tested already, so it is fine for 10 immediately as
well.  Thanks!


Segher


Re: [PATCH v2] vect/rs6000: Support vector with length cost modeling

2020-07-22 Thread Segher Boessenkool
Hi!

On Wed, Jul 22, 2020 at 09:26:39AM +0800, Kewen.Lin wrote:
> +/* For some target specific vectorization cost which can't be handled per 
> stmt,
> +   we check the requisite conditions and adjust the vectorization cost
> +   accordingly if satisfied.  One typical example is to model shift cost for
> +   vector with length by counting number of required lengths under condition
> +   LOOP_VINFO_FULLY_WITH_LENGTH_P.  */
> +
> +static void
> +adjust_vect_cost (rs6000_cost_data *data)
> +{

Maybe call it rs6000_adjust_vect_cost?  For consistency, but also it
could (in the future) collide with a global function of the same name (it
is a very non-specific name).

> +   /* Each length needs one shift to fill into bits 0-7.  */
> +   shift_cnt += (num_vectors_m1 + 1);

That doesn't need parentheses.

>if (cost_data->loop_info)
> -rs6000_density_test (cost_data);
> +{
> +  adjust_vect_cost (cost_data);
> +  rs6000_density_test (cost_data);
> +}

^^^ consistency :-)

The rs6000 parts are fine for trunk, thanks!


Segher


Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread Richard Sandiford
luoxhu  writes:
> Hi,
>
> On 2020/7/22 19:05, Richard Sandiford wrote:
>> This wasn't really what I meant.  Using subregs is fine, but I was
>> thinking of:
>> 
>>/* Also try a wider mode if the necessary punning is either not
>>   desirable or not possible.  */
>>if (!CONSTANT_P (store_info->rhs)
>>&& !targetm.modes_tieable_p (new_mode, store_mode))
>>  continue;
>> 
>>if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>>  {
>>/* Try to implement the shift using a subreg.  */
>>poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
>>shift);
>>rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>>  store_mode, offset);
>>if (rhs_subreg)
>>  {
>>...
>>break;
>>  }
>>  }
>> 
>> where the rhs_subreg is from your original patch.
>> 
>> The multiple_p should be that way round: the shift needs to be a
>> multiple of the new_mode for the subreg to be valid.
>> 
>> I think this should also avoid the BITS_PER_WORD problem.  On the
>> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
>> we're using subregs, so maybe moving it to after the multiple_p
>> if block would still make sense.
>> 
>
> Thanks, I took that rhs_subreg part back for the v3 patch and updated a bit
> based on your prototype, shift should be put in op1 as multiple_p requires
> op0 >= op1. 
>
> Then, new_mode is still TImode same to store_mode, offset will return 8 when
> shift is 64,  simplify_gen_subreg needs an additional inner_mode(DImode) 
> generated from "smallest_int_mode_for_size (shift)" to get rhs_subreg, 
> otherwise it will return NULL if new_mode is equal to store_mode.
>
> Lastly, move the BITS_PER_UNIT after multiple_p as it still need generate
> shift_seq for other circumstances. :)

I don't understand why my version doesn't work though.  The point
is that we're using the containing:

  FOR_EACH_MODE_FROM (new_mode_iter,
  smallest_int_mode_for_size (access_size * BITS_PER_UNIT))

to find a suitable mode.  In the existing code it's searching for a mode
that is suitable for the shift.  In the new code it's finding a mode that
is suitable for the outer mode of the subreg (hence using new_mode as the
first argument to simplify_gen_subreg above).  It shouldn't be necessary
to use smallest_int_mode_for_size to find a different mode.

That's also why the multiple_p is the way round it is above.  The idea
is that the shift amount must be a multiple of the size of the outer mode
(here new_mode) in order for the subreg to be valid.

So in your example, the loop should be finding new_mode == DImode,
seeing that the shift amount of 64 is a multiple of the size of DImode,
and trying to convert that shift anount into a DImode subreg of the
TImode value.

Thanks,
Richard

> [PATCH v3] dse: Remove partial load after full store for high part 
> access[PR71309]
>
>
> This patch could optimize (works for char/short/int/void*):
>
> 6: r119:TI=[r118:DI+0x10]
> 7: [r118:DI]=r119:TI
> 8: r121:DI=[r118:DI+0x8]
>
> =>
>
> 6: r119:TI=[r118:DI+0x10]
> 16: r122:DI=r119:TI#8
>
> Final ASM will be as below without partial load after full store(stxv+ld):
>   ld 10,16(3)
>   mr 9,3
>   ld 3,24(3)
>   std 10,0(9)
>   std 3,8(9)
>   blr
>
> It could achieve ~25% performance improvement for typical cases on
> Power9.  Bootstrap and regression tested on Power9-LE.
>
> For AArch64, one ldr is replaced by mov with this patch:
>
> ldp x2, x3, [x0, 16]
> stp x2, x3, [x0]
> ldr x0, [x0, 8]
>
> =>
>
> mov x1, x0
> ldp x2, x0, [x0, 16]
> stp x2, x0, [x1]
>
> gcc/ChangeLog:
>
> 2020-07-22  Xionghu Luo  
>
>   PR rtl-optimization/71309
>   * dse.c (find_shift_sequence): Use subreg of shifted from high part
>   register to avoid loading from address.
>
> gcc/testsuite/ChangeLog:
>
> 2020-07-22  Xionghu Luo  
>
>   PR rtl-optimization/71309
>   * gcc.target/powerpc/pr71309.c: New test.
> ---
>  gcc/dse.c  | 21 --
>  gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
>  2 files changed, 52 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c
>
> diff --git a/gcc/dse.c b/gcc/dse.c
> index bbe792e48e8..aaa161237c3 100644
> --- a/gcc/dse.c
> +++ b/gcc/dse.c
> @@ -1736,8 +1736,6 @@ find_shift_sequence (poly_int64 access_size,
>int cost;
>  
>new_mode = new_mode_iter.require ();
> -  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
> - break;
>  
>/* If a constant was stored into memory, try to simplify it here,
>otherwise the cost of the shift might preclude this optimization
> @@ -1779,6 +1777,25 @@ find_shift_sequence (poly_int64 access_size,
> && !targetm.modes_tieable_p (new_mode, store_mode))
>   continue;
>  
> + 

[PATCH] rs6000: __builtin_mma_disassemble_acc() doesn't store elements correctly in LE mode

2020-07-22 Thread Peter Bergner via Gcc-patches
PR96236 shows a problem where we don't store our 512-bit accumulators
correctly in little-endian mode.  The patch below detects when we're doing a
little-endian memory access and stores to the correct memory locations.

This passed bootstrap and regtesting with no regressions.  Raji verified
the runnable test case changes work with a fixed compiler.

Ok for trunk and backport to the GCC 10 branch once it reopens?

Peter

gcc/
PR target/96236
* config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Handle
little-endian memory ordering.

gcc/testsuite/
PR target/96236
* gcc.target/powerpc/mma-double-test.c: Update storing results for
correct little-endian ordering.
* gcc.target/powerpc/mma-single-test.c: Likewise.

diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c
index 5ec3f2c55ad..bb0fdf29688 100644
--- a/gcc/config/rs6000/rs6000-call.c
+++ b/gcc/config/rs6000/rs6000-call.c
@@ -11154,11 +11154,12 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator 
*gsi)
   tree src_array = build1 (VIEW_CONVERT_EXPR, array_type, src);
   for (unsigned i = 0; i < 4; i++)
{
+ unsigned index = WORDS_BIG_ENDIAN ? i : 3 - i;
  tree ref = build4 (ARRAY_REF, unsigned_V16QI_type_node, src_array,
 build_int_cst (size_type_node, i),
 NULL_TREE, NULL_TREE);
  tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base,
-build_int_cst (dst_type, i * 16));
+build_int_cst (dst_type, index * 16));
  gimplify_assign (dst, ref, _seq);
}
   pop_gimplify_context (NULL);
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c 
b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
index ac84ae30004..044a288ebcc 100755
--- a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
@@ -12,13 +12,13 @@ typedef double v4sf_t __attribute__ ((vector_size (16)));
 #define SAVE_ACC(ACC, ldc, J)  \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) [0*ldc+J]; \
-  rowC[0] += result[3] ; \
+  rowC[0] += result[0]; \
   rowC = (v4sf_t *) [1*ldc+J]; \
-  rowC[0] += result[2] ; \
+  rowC[0] += result[1]; \
   rowC = (v4sf_t *) [2*ldc+J]; \
-  rowC[0] += result[1] ; \
+  rowC[0] += result[2]; \
   rowC = (v4sf_t *) [3*ldc+J]; \
- rowC[0] += result[0] ;
+ rowC[0] += result[3];
 
 void
 MMA (int m, int n, int k, double *A, double *B, double *C)
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-single-test.c 
b/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
index 15369a64025..7e628df45b7 100755
--- a/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
@@ -12,24 +12,24 @@ typedef float v4sf_t __attribute__ ((vector_size (16)));
 #define SAVE_ACC(ACC, ldc,J)  \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) [0*ldc+J]; \
-  rowC[0] += result[3] ; \
+  rowC[0] += result[0]; \
   rowC = (v4sf_t *) [1*ldc+J]; \
-  rowC[0] += result[2] ; \
+  rowC[0] += result[1]; \
   rowC = (v4sf_t *) [2*ldc+J]; \
-  rowC[0] += result[1] ; \
+  rowC[0] += result[2]; \
   rowC = (v4sf_t *) [3*ldc+J]; \
- rowC[0] += result[0] ;
+ rowC[0] += result[3];
 
 #define SAVE_ACC1(ACC,ldc, J)  \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) [4* ldc+J]; \
-  rowC[0] += result[3] ; \
+  rowC[0] += result[0]; \
   rowC = (v4sf_t *) [5*ldc+J]; \
-  rowC[0] += result[2] ; \
+  rowC[0] += result[1]; \
   rowC = (v4sf_t *) [6*ldc+J]; \
-  rowC[0] += result[1] ; \
+  rowC[0] += result[2]; \
   rowC = (v4sf_t *) [7*ldc+J]; \
- rowC[0] += result[0] ;
+ rowC[0] += result[3];
 void
 MMA (int m, int n, int k, float *A, float *B, float *C)
 {


Re: [PATCH 2/2] Aarch64: Add branch diluter pass

2020-07-22 Thread Segher Boessenkool
Hi!

On Wed, Jul 22, 2020 at 03:53:34PM +0200, Andrea Corallo wrote:
> Andrew Pinski  writes:
> > Can you give a simple example of what this patch does?
> 
> Sure, this pass simply moves a sliding window over the insns trying to
> make sure that we never have more than 'max_branch' branches for every
> 'granule_size' insns.
> 
> If too many branches are detected, nops are added where considered less
> harmful to correct that.

Should that actually be a sliding window, or should there actually just
not be more than N branches per aligned block of machine code?  Like,
per fetch group.

Can you not use ASM_OUTPUT_ALIGN_WITH_NOP (or ASM_OUTPUT_MAX_SKIP_ALIGN
even) then?  GCC has infrastructure for that, already.


Segher


Re: [PATCH v3] vect/rs6000: Support vector with length cost modeling

2020-07-22 Thread Kewen.Lin via Gcc-patches
Hi,

Sorry, please ignore the previously attached file, which isn't the latest one
although they are almost the same.  The latest tested version is attached here.

Sorry for the inconvenience.

BR,
Kewen

on 2020/7/22 下午11:48, Kewen.Lin via Gcc-patches wrote:
> 
> It's a great idea, by following your subsequent suggestion to make the 
> structure
> like:
> 
>   - calculate peel_iters_prologue
>   - calculate peel_iters_epilogue
>   - add costs associated with peel_iters_prologue
>   - add costs associated with peel_iters_epilogue
>   - add costs related to branch taken/not_taken.
> 
> the updated v3 is attached.
> 
> Just bootstrapped/regtested on powerpc64le-linux-gnu (P9) with explicit
> param vect-partial-vector-usage=1, I'll test it without partial vectors
> setting, also on aarch64 later.
> 
> BR,
> Kewen
> -
> gcc/ChangeLog:
> 
>   * config/rs6000/rs6000.c (adjust_vect_cost_per_loop): New function.
>   (rs6000_finish_cost): Call adjust_vect_cost_per_loop.
>   * tree-vect-loop.c (vect_get_known_peeling_cost): Factor out some code
>   to determine peel_iters_epilogue to function ...
>   (vect_get_peel_iters_epilogue): ... this.  New function.
>   (vect_estimate_min_profitable_iters):  Add cost modeling for vector
>   with length, refactor cost calculation on peel_iters_prologue and
>   peel_iters_epilogue.
> 

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 009afc5f894..d71f2bf1c16 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -5177,6 +5177,34 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, 
int count,
   return retval;
 }
 
+/* For some target specific vectorization cost which can't be handled per stmt,
+   we check the requisite conditions and adjust the vectorization cost
+   accordingly if satisfied.  One typical example is to model shift cost for
+   vector with length by counting number of required lengths under condition
+   LOOP_VINFO_FULLY_WITH_LENGTH_P.  */
+
+static void
+adjust_vect_cost_per_loop (rs6000_cost_data *data)
+{
+  struct loop *loop = data->loop_info;
+  gcc_assert (loop);
+  loop_vec_info loop_vinfo = loop_vec_info_for_loop (loop);
+
+  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+{
+  rgroup_controls *rgc;
+  unsigned int num_vectors_m1;
+  unsigned int shift_cnt = 0;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
+   if (rgc->type)
+ /* Each length needs one shift to fill into bits 0-7.  */
+ shift_cnt += (num_vectors_m1 + 1);
+
+  rs6000_add_stmt_cost (loop_vinfo, (void *) data, shift_cnt, scalar_stmt,
+   NULL, NULL_TREE, 0, vect_body);
+}
+}
+
 /* Implement targetm.vectorize.finish_cost.  */
 
 static void
@@ -5186,7 +5214,10 @@ rs6000_finish_cost (void *data, unsigned *prologue_cost,
   rs6000_cost_data *cost_data = (rs6000_cost_data*) data;
 
   if (cost_data->loop_info)
-rs6000_density_test (cost_data);
+{
+  adjust_vect_cost_per_loop (cost_data);
+  rs6000_density_test (cost_data);
+}
 
   /* Don't vectorize minimum-vectorization-factor, simple copy loops
  that require versioning for any reason.  The vectorization is at
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e933441b922..8746c5ae582 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3474,42 +3474,56 @@ vect_is_simple_reduction (loop_vec_info loop_info, 
stmt_vec_info phi_info,
   return NULL;
 }
 
-/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
-int
-vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
- int *peel_iters_epilogue,
- stmt_vector_for_cost *scalar_cost_vec,
-stmt_vector_for_cost *prologue_cost_vec,
-stmt_vector_for_cost *epilogue_cost_vec)
+/* Calculate how many iterations peeled for epilogue with information 
LOOP_VINFO
+   and PEEL_ITERS_PROLOGUE.  */
+
+static int
+vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int 
peel_iters_prologue)
 {
-  int retval = 0;
   int assumed_vf = vect_vf_for_cost (loop_vinfo);
-
   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
 {
-  *peel_iters_epilogue = assumed_vf / 2;
   if (dump_enabled_p ())
-dump_printf_loc (MSG_NOTE, vect_location,
+   dump_printf_loc (MSG_NOTE, vect_location,
 "cost model: epilogue peel iters set to vf/2 "
 "because loop iterations are unknown .\n");
-
-  /* If peeled iterations are known but number of scalar loop
- iterations are unknown, count a taken branch per peeled loop.  */
-  retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
-NULL, NULL_TREE, 0, vect_prologue);
-  retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
- NULL, 

Re: [PATCH 3/4] libstdc++: Add floating-point std::to_chars implementation

2020-07-22 Thread Patrick Palka via Gcc-patches
On Mon, 20 Jul 2020, Patrick Palka wrote:

> On Mon, 20 Jul 2020, Jonathan Wakely wrote:
> 
> > On 20/07/20 08:53 -0400, Patrick Palka via Libstdc++ wrote:
> > > On Mon, 20 Jul 2020, Jonathan Wakely wrote:
> > > 
> > > > On 19/07/20 23:37 -0400, Patrick Palka via Libstdc++ wrote:
> > > > > On Fri, 17 Jul 2020, Patrick Palka wrote:
> > > > >
> > > > > > On Fri, 17 Jul 2020, Patrick Palka wrote:
> > > > > >
> > > > > > > On Wed, 15 Jul 2020, Patrick Palka wrote:
> > > > > > >
> > > > > > > > On Tue, 14 Jul 2020, Patrick Palka wrote:
> > > > > > > >
> > > > > > > > > This implements the floating-point std::to_chars overloads for
> > > > > > float,
> > > > > > > > > double and long double.  We use the Ryu library to compute the
> > > > > > shortest
> > > > > > > > > round-trippable fixed and scientific forms of a number for
> > > > float,
> > > > > > double
> > > > > > > > > and long double.  We also use Ryu for performing fixed and
> > > > > > scientific
> > > > > > > > > formatting of float and double. For formatting long double 
> > > > > > > > > with
> > > > an
> > > > > > > > > explicit precision argument we use a printf fallback.
> > > > Hexadecimal
> > > > > > > > > formatting for float, double and long double is implemented 
> > > > > > > > > from
> > > > > > > > > scratch.
> > > > > > > > >
> > > > > > > > > The supported long double binary formats are float64 (same as
> > > > > > double),
> > > > > > > > > float80 (x86 extended precision), float128 and ibm128.
> > > > > > > > >
> > > > > > > > > Much of the complexity of the implementation is in computing 
> > > > > > > > > the
> > > > > > exact
> > > > > > > > > output length before handing it off to Ryu (which doesn't do
> > > > bounds
> > > > > > > > > checking).  In some cases it's hard to compute the output 
> > > > > > > > > length
> > > > > > before
> > > > > > > > > the fact, so in these cases we instead compute an upper bound 
> > > > > > > > > on
> > > > the
> > > > > > > > > output length and use a sufficiently-sized intermediate buffer
> > > > (if
> > > > > > the
> > > > > > > > > output range is smaller than the upper bound).
> > > > > > > > >
> > > > > > > > > Another source of complexity is in the general-with-precision
> > > > > > formatting
> > > > > > > > > mode, where we need to do zero-trimming of the string returned
> > > > by
> > > > > > Ryu, and
> > > > > > > > > where we also take care to avoid having to format the string a
> > > > > > second
> > > > > > > > > time when the general formatting mode resolves to fixed.
> > > > > > > > >
> > > > > > > > > Tested on x86_64-pc-linux-gnu, aarch64-unknown-linux-gnu,
> > > > > > > > > s390x-ibm-linux-gnu, and powerpc64-unknown-linux-gnu.
> > > > > > > > >
> > > > > > > > > libstdc++-v3/ChangeLog:
> > > > > > > > >
> > > > > > > > >   * acinclude.m4 (libtool_VERSION): Bump to 6:29:0.
> > > > > > > > >   * config/abi/pre/gnu.ver: Add new exports.
> > > > > > > > >   * configure: Regenerate.
> > > > > > > > >   * include/std/charconv (to_chars): Declare the 
> > > > > > > > > floating-point
> > > > > > > > >   overloads for float, double and long double.
> > > > > > > > >   * src/c++17/Makefile.am (sources): Add 
> > > > > > > > > floating_to_chars.cc.
> > > > > > > > >   * src/c++17/Makefile.in: Regenerate.
> > > > > > > > >   * src/c++17/floating_to_chars.cc: New file.
> > > > > > > > >   * testsuite/20_util/to_chars/long_double.cc: New test.
> > > > > > > > >   * testsuite/util/testsuite_abi.cc: Add new symbol 
> > > > > > > > > version.
> > > > > > > >
> > > > > > > > Here is v2 of this patch, which fixes a build failure on i386 
> > > > > > > > due
> > > > to
> > > > > > > > __int128 being unavailable, by refactoring the long double 
> > > > > > > > binary
> > > > > > format
> > > > > > > > selection to avoid referring to __int128 when it doesn't exist.
> > > > The
> > > > > > > > patch also makes the hex formatting for 80-bit long double use
> > > > > > uint64_t
> > > > > > > > instead of __int128 since the mantissa has exactly 64 bits in 
> > > > > > > > this
> > > > > > case.
> > > > > > >
> > > > > > > Here's v3 which just makes some minor stylistic adjustments, and
> > > > most
> > > > > > > notably replaces the use of _GLIBCXX_DEBUG with 
> > > > > > > _GLIBCXX_ASSERTIONS
> > > > > > > since we just want to enable __glibcxx_assert and not all of debug
> > > > mode.
> > > > > >
> > > > > > Here's v4, which should now correctly support using  with
> > > > > > -mlong-double-64 on targets with a large default long double type.
> > > > > > This is done by defining the long double to_chars overloads as 
> > > > > > inline
> > > > > > wrappers around the double overloads within  whenever
> > > > > > __DBL_MANT_DIG__ equals __LDBL_MANT_DIG__.
> > > > >
> > > > > >
> > > > > > -- >8 --
> > > > > >
> > > > > > Subject: [PATCH 3/4] libstdc++: Add floating-point std::to_chars
> > > > > >  implementation
> > > > > >
> > > > > > This implements the 

Re: [PATCH] rs6000: Rename adjust_vectorization_cost

2020-07-22 Thread Kewen.Lin via Gcc-patches
Hi Segher,

on 2020/7/22 下午4:26, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Jul 22, 2020 at 09:44:52AM +0800, Kewen.Lin wrote:
>> This trivial patch is to rename adjust_vectorization_cost to 
>> adjust_vect_cost_per_stmt.  Hope it's more meaningful, as well
>> as to avoid the confusion between the possible to be landed
>> function "adjust_vect_cost" and "adjust_vectorization_cost".
>>
>> Even without "adjust_vect_cost", I guess it's still good?
> 
> It is an improvement for sure, so it is okay for trunk of course.  It
> still isn't very clear from the name how this would differ from
> adjust_vect_cost, but that _is_ obviously the more generic name, so
> that is good.
> 

Thanks!  To make adjust_vect_cost more meaningful, I just updated it
to adjust_vect_cost_per_loop in the latest version, hope it gets
a bit better.  :-)

BR,
Kewen


[Patch] OpenMP: Support 'if (simd:/cancel:' in Fortran

2020-07-22 Thread Tobias Burnus

Adds support for the simd and cancel modifier of
'if (: logical_expr)', which is already
supported in C/C++.

OK?

Tobias

-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
OpenMP: Support 'if (simd:/cancel:' in Fortran

gcc/fortran/ChangeLog:

	* gfortran.h (enum gfc_omp_if_kind): Add OMP_IF_CANCEL and OMP_IF_SIMD.
	* openmp.c (OMP_SIMD_CLAUSES): Add OMP_CLAUSE_IF.
	(gfc_match_omp_clauses, resolve_omp_clauses): Handle 'if (simd/cancel:'.
	* dump-parse-tree.c (show_omp_clauses): Likewise.
	* trans-openmp.c (gfc_trans_omp_clauses, gfc_trans_omp_cancel,
	(gfc_split_omp_clauses): Likewise.

gcc/testsuite/ChangeLog:

	* gfortran.dg/gomp/cancel-2.f90: New test.
	* gfortran.dg/gomp/cancel-3.f90: New test.
	* gfortran.dg/gomp/if-1.f90: New test.

 gcc/fortran/dump-parse-tree.c   |   2 +
 gcc/fortran/gfortran.h  |   2 +
 gcc/fortran/openmp.c|  46 +--
 gcc/fortran/trans-openmp.c  |  17 +++-
 gcc/testsuite/gfortran.dg/gomp/cancel-2.f90 |  15 
 gcc/testsuite/gfortran.dg/gomp/cancel-3.f90 |  35 
 gcc/testsuite/gfortran.dg/gomp/if-1.f90 | 122 
 7 files changed, 229 insertions(+), 10 deletions(-)

diff --git a/gcc/fortran/dump-parse-tree.c b/gcc/fortran/dump-parse-tree.c
index f9a6bf4f1f8..2a02bc871bc 100644
--- a/gcc/fortran/dump-parse-tree.c
+++ b/gcc/fortran/dump-parse-tree.c
@@ -1693,7 +1693,9 @@ show_omp_clauses (gfc_omp_clauses *omp_clauses)
 if (omp_clauses->if_exprs[i])
   {
 	static const char *ifs[] = {
+	  "CANCEL",
 	  "PARALLEL",
+	  "SIMD",
 	  "TASK",
 	  "TASKLOOP",
 	  "TARGET",
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 264822ef9f8..1648831736c 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -1320,7 +1320,9 @@ enum gfc_omp_cancel_kind
 
 enum gfc_omp_if_kind
 {
+  OMP_IF_CANCEL,
   OMP_IF_PARALLEL,
+  OMP_IF_SIMD,
   OMP_IF_TASK,
   OMP_IF_TASKLOOP,
   OMP_IF_TARGET,
diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index 58552af0982..e89ae295a31 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -1303,7 +1303,9 @@ gfc_match_omp_clauses (gfc_omp_clauses **cp, const omp_mask mask,
 		{
 		  /* This should match the enum gfc_omp_if_kind order.  */
 		  static const char *ifs[OMP_IF_LAST] = {
+		" cancel : %e )",
 		" parallel : %e )",
+		" simd : %e )",
 		" task : %e )",
 		" taskloop : %e )",
 		" target : %e )",
@@ -2568,7 +2570,8 @@ cleanup:
 #define OMP_SIMD_CLAUSES \
   (omp_mask (OMP_CLAUSE_PRIVATE) | OMP_CLAUSE_LASTPRIVATE		\
| OMP_CLAUSE_REDUCTION | OMP_CLAUSE_COLLAPSE | OMP_CLAUSE_SAFELEN	\
-   | OMP_CLAUSE_LINEAR | OMP_CLAUSE_ALIGNED | OMP_CLAUSE_SIMDLEN)
+   | OMP_CLAUSE_LINEAR | OMP_CLAUSE_ALIGNED | OMP_CLAUSE_SIMDLEN	\
+   | OMP_CLAUSE_IF)
 #define OMP_TASK_CLAUSES \
   (omp_mask (OMP_CLAUSE_PRIVATE) | OMP_CLAUSE_FIRSTPRIVATE		\
| OMP_CLAUSE_SHARED | OMP_CLAUSE_IF | OMP_CLAUSE_DEFAULT		\
@@ -4133,33 +4136,53 @@ resolve_omp_clauses (gfc_code *code, gfc_omp_clauses *omp_clauses,
 	else
 	  switch (code->op)
 	{
+	case EXEC_OMP_CANCEL:
+	  ok = ifc == OMP_IF_CANCEL;
+	  break;
+
 	case EXEC_OMP_PARALLEL:
 	case EXEC_OMP_PARALLEL_DO:
 	case EXEC_OMP_PARALLEL_SECTIONS:
 	case EXEC_OMP_PARALLEL_WORKSHARE:
-	case EXEC_OMP_PARALLEL_DO_SIMD:
 	case EXEC_OMP_DISTRIBUTE_PARALLEL_DO:
-	case EXEC_OMP_DISTRIBUTE_PARALLEL_DO_SIMD:
 	case EXEC_OMP_TEAMS_DISTRIBUTE_PARALLEL_DO:
-	case EXEC_OMP_TEAMS_DISTRIBUTE_PARALLEL_DO_SIMD:
 	  ok = ifc == OMP_IF_PARALLEL;
 	  break;
 
+	case EXEC_OMP_PARALLEL_DO_SIMD:
+	case EXEC_OMP_DISTRIBUTE_PARALLEL_DO_SIMD:
+	case EXEC_OMP_TEAMS_DISTRIBUTE_PARALLEL_DO_SIMD:
+	  ok = ifc == OMP_IF_PARALLEL || ifc == OMP_IF_SIMD;
+	  break;
+
+	case EXEC_OMP_SIMD:
+	case EXEC_OMP_DO_SIMD:
+	case EXEC_OMP_DISTRIBUTE_SIMD:
+	case EXEC_OMP_TEAMS_DISTRIBUTE_SIMD:
+	  ok = ifc == OMP_IF_SIMD;
+	  break;
+
 	case EXEC_OMP_TASK:
 	  ok = ifc == OMP_IF_TASK;
 	  break;
 
 	case EXEC_OMP_TASKLOOP:
-	case EXEC_OMP_TASKLOOP_SIMD:
 	  ok = ifc == OMP_IF_TASKLOOP;
 	  break;
 
+	case EXEC_OMP_TASKLOOP_SIMD:
+	  ok = ifc == OMP_IF_TASKLOOP || ifc == OMP_IF_SIMD;
+	  break;
+
 	case EXEC_OMP_TARGET:
 	case EXEC_OMP_TARGET_TEAMS:
 	case EXEC_OMP_TARGET_TEAMS_DISTRIBUTE:
+	  ok = ifc == OMP_IF_TARGET;
+	  break;
+
 	case EXEC_OMP_TARGET_TEAMS_DISTRIBUTE_SIMD:
 	case EXEC_OMP_TARGET_SIMD:
-	  ok = ifc == OMP_IF_TARGET;
+	  ok = ifc == OMP_IF_TARGET || ifc == OMP_IF_SIMD;
 	  break;
 
 	case EXEC_OMP_TARGET_DATA:
@@ -4179,13 +4202,18 @@ resolve_omp_clauses (gfc_code *code, gfc_omp_clauses *omp_clauses,
 	  break;
 
 	case 

[PATCH v3] vect/rs6000: Support vector with length cost modeling

2020-07-22 Thread Kewen.Lin via Gcc-patches
Hi Richard,

Thanks for the review!

on 2020/7/22 下午5:11, Richard Sandiford wrote:
> "Kewen.Lin"  writes:
>> -  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>> -{
>> -  peel_iters_prologue = 0;
>> -  peel_iters_epilogue = 0;
>> +  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>> +{
>> +  /* Calculate how many masks we need to generate.  */
>> +  unsigned int num_masks = 0;
>> +  rgroup_controls *rgm;
>> +  unsigned int num_vectors_m1;
>> +  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
>> +if (rgm->type)
>> +  num_masks += num_vectors_m1 + 1;
>> +  gcc_assert (num_masks > 0);
>> +
>> +  /* In the worst case, we need to generate each mask in the prologue
>> + and in the loop body.  One of the loop body mask instructions
>> + replaces the comparison in the scalar loop, and since we don't
>> + count the scalar comparison against the scalar body, we shouldn't
>> + count that vector instruction against the vector body either.
>> +
>> + Sometimes we can use unpacks instead of generating prologue
>> + masks and sometimes the prologue mask will fold to a constant,
>> + so the actual prologue cost might be smaller.  However, it's
>> + simpler and safer to use the worst-case cost; if this ends up
>> + being the tie-breaker between vectorizing or not, then it's
>> + probably better not to vectorize.  */
>> +  (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
>> +vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
>> +  (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
>> +vector_stmt, NULL, NULL_TREE, 0, vect_body);
>> +}
>> +  else
>> +{
>> +  gcc_assert (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
>> +
>> +  /* Consider cost for LOOP_VINFO_PEELING_FOR_ALIGNMENT.  */
>> +  if (npeel < 0)
>> +{
>> +  peel_iters_prologue = assumed_vf / 2;
>> +  /* See below, if peeled iterations are unknown, count a taken
>> + branch and a not taken branch per peeled loop.  */
>> +  (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
>> +cond_branch_taken, NULL, NULL_TREE, 0,
>> +vect_prologue);
>> +  (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
>> +cond_branch_not_taken, NULL, NULL_TREE, 0,
>> +vect_prologue);
>> +}
>> +  else
>> +{
>> +  peel_iters_prologue = npeel;
>> +  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
>> +/* See vect_get_known_peeling_cost, if peeled iterations are
>> +   known but number of scalar loop iterations are unknown, count
>> +   a taken branch per peeled loop.  */
>> +(void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
>> +  cond_branch_taken, NULL, NULL_TREE, 0,
>> +  vect_prologue);
>> +}
> 
> I think it'd be good to avoid duplicating this.  How about the
> following structure?
> 
>   if (vect_use_loop_mask_for_alignment_p (…))
> {
>   peel_iters_prologue = 0;
>   peel_iters_epilogue = 0;
> }
>   else if (npeel < 0)
> {
>   … // A
> }
>   else
> {
>   …vect_get_known_peeling_cost stuff…
> }
> 
> but in A and vect_get_known_peeling_cost, set peel_iters_epilogue to:
> 
>   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0
> 
> for LOOP_VINFO_USING_PARTIAL_VECTORS_P, instead of setting it to
> whatever value we'd normally use.  Then wrap:
> 
>   (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, 
> cond_branch_taken,
>   NULL, NULL_TREE, 0, vect_epilogue);
>   (void) add_stmt_cost (loop_vinfo,
>   target_cost_data, 1, cond_branch_not_taken,
>   NULL, NULL_TREE, 0, vect_epilogue);
> 
> in !LOOP_VINFO_USING_PARTIAL_VECTORS_P and make the other vect_epilogue
> stuff in A conditional on peel_iters_epilogue != 0.
> 
> This will also remove the need for the existing LOOP_VINFO_FULLY_MASKED_P
> code:
> 
>   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
>   {
> /* We need to peel exactly one iteration.  */
> peel_iters_epilogue += 1;
> stmt_info_for_cost *si;
> int j;
> FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
>   j, si)
>   (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
> si->kind, si->stmt_info, si->vectype,
> si->misalign, vect_epilogue);
>   }
> 
> Then, after the above, have:
> 
>   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> …add costs for mask overhead…
>   else if 

c++: Don't add enums to class's decl_list

2020-07-22 Thread Nathan Sidwell
We don't need to add CONST_DECLs to a template decl's decl list.  Also 
made the code flow a bit clearer.


gcc/cp/
* class.c (maybe_add_class_template_decl_list): Don't add 
CONST_DECLs.


nathan

--
Nathan Sidwell
diff --git i/gcc/cp/class.c w/gcc/cp/class.c
index 803b33bf346..a3913f4ce0b 100644
--- i/gcc/cp/class.c
+++ w/gcc/cp/class.c
@@ -3049,11 +3049,14 @@ finish_struct_anon (tree t)
 void
 maybe_add_class_template_decl_list (tree type, tree t, int friend_p)
 {
-  /* Save some memory by not creating TREE_LIST if TYPE is not template.  */
-  if (CLASSTYPE_TEMPLATE_INFO (type))
-CLASSTYPE_DECL_LIST (type)
-  = tree_cons (friend_p ? NULL_TREE : type,
-		   t, CLASSTYPE_DECL_LIST (type));
+  if (CLASSTYPE_TEMPLATE_INFO (type)
+  && TREE_CODE (t) != CONST_DECL)
+{
+  tree purpose = friend_p ? NULL_TREE : type;
+
+  CLASSTYPE_DECL_LIST (type)
+	= tree_cons (purpose, t, CLASSTYPE_DECL_LIST (type));
+}
 }
 
 /* This function is called from declare_virt_assop_and_dtor via


dumpfile: filenames containing -

2020-07-22 Thread Nathan Sidwell


I discovered the dump machinery would get confused by filenames 
containing '-'. Fixed thusly as obvious.


gcc/
* dumpfile.c (parse_dump_option): Deal with filenames
containing '-'

nathan
--
Nathan Sidwell
diff --git i/gcc/dumpfile.c w/gcc/dumpfile.c
index 9a5496a18e8..8e53aeff340 100644
--- i/gcc/dumpfile.c
+++ w/gcc/dumpfile.c
@@ -1800,7 +1800,7 @@ parse_dump_option (const char *option_value, const char **pos_p)
   end_ptr = strchr (ptr, '-');
   eq_ptr = strchr (ptr, '=');
 
-  if (eq_ptr && !end_ptr)
+  if (eq_ptr && (!end_ptr || end_ptr > eq_ptr))
 	end_ptr = eq_ptr;
 
   if (!end_ptr)


c++: structural_comptypes addition

2020-07-22 Thread Nathan Sidwell
I had to debug structural_comptypes, and its complex if conditions and 
tail calling of same_type_p made that hard.  I'd hope we can turn the 
equivalent of return boolean_fn () ? true : false; into a tail call of 
the boolean.  We also were not dealing with TYPEOF_TYPE.


gcc/cp/
* typeck.c (structural_comptypes): [DECLTYPE_TYPE] break
apart complex if.
[UNDERLYING_TYPE]: Use an if.
[TYPEOF_TYPE]: New.

nathan
--
Nathan Sidwell
diff --git i/gcc/cp/typeck.c w/gcc/cp/typeck.c
index 589e014f855..adc088ce1d7 100644
--- i/gcc/cp/typeck.c
+++ w/gcc/cp/typeck.c
@@ -1452,19 +1452,25 @@ structural_comptypes (tree t1, tree t2, int strict)
 
 case DECLTYPE_TYPE:
   if (DECLTYPE_TYPE_ID_EXPR_OR_MEMBER_ACCESS_P (t1)
-  != DECLTYPE_TYPE_ID_EXPR_OR_MEMBER_ACCESS_P (t2)
-	  || (DECLTYPE_FOR_LAMBDA_CAPTURE (t1)
-	  != DECLTYPE_FOR_LAMBDA_CAPTURE (t2))
-	  || (DECLTYPE_FOR_LAMBDA_PROXY (t1)
-	  != DECLTYPE_FOR_LAMBDA_PROXY (t2))
-  || !cp_tree_equal (DECLTYPE_TYPE_EXPR (t1), 
- DECLTYPE_TYPE_EXPR (t2)))
+  != DECLTYPE_TYPE_ID_EXPR_OR_MEMBER_ACCESS_P (t2))
+	return false;
+  if (DECLTYPE_FOR_LAMBDA_CAPTURE (t1) != DECLTYPE_FOR_LAMBDA_CAPTURE (t2))
+	return false;
+  if (DECLTYPE_FOR_LAMBDA_PROXY (t1) != DECLTYPE_FOR_LAMBDA_PROXY (t2))
+	return false;
+  if (!cp_tree_equal (DECLTYPE_TYPE_EXPR (t1), DECLTYPE_TYPE_EXPR (t2)))
 return false;
   break;
 
 case UNDERLYING_TYPE:
-  return same_type_p (UNDERLYING_TYPE_TYPE (t1), 
-			  UNDERLYING_TYPE_TYPE (t2));
+  if (!same_type_p (UNDERLYING_TYPE_TYPE (t1), UNDERLYING_TYPE_TYPE (t2)))
+	return false;
+  break;
+
+case TYPEOF_TYPE:
+  if (!cp_tree_equal (TYPEOF_TYPE_EXPR (t1), TYPEOF_TYPE_EXPR (t2)))
+	return false;
+  break;
 
 default:
   return false;


Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/22 19:05, Richard Sandiford wrote:
> This wasn't really what I meant.  Using subregs is fine, but I was
> thinking of:
> 
>/* Also try a wider mode if the necessary punning is either not
>desirable or not possible.  */
>if (!CONSTANT_P (store_info->rhs)
> && !targetm.modes_tieable_p (new_mode, store_mode))
>   continue;
> 
>if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>   {
> /* Try to implement the shift using a subreg.  */
> poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
> shift);
> rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>   store_mode, offset);
> if (rhs_subreg)
>   {
> ...
> break;
>   }
>   }
> 
> where the rhs_subreg is from your original patch.
> 
> The multiple_p should be that way round: the shift needs to be a
> multiple of the new_mode for the subreg to be valid.
> 
> I think this should also avoid the BITS_PER_WORD problem.  On the
> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
> we're using subregs, so maybe moving it to after the multiple_p
> if block would still make sense.
> 

Thanks, I took that rhs_subreg part back for the v3 patch and updated it a bit
based on your prototype; shift should be put in op1, as multiple_p requires
op0 >= op1. 

Then, new_mode is still TImode, the same as store_mode, and offset will return 8 when
shift is 64.  simplify_gen_subreg needs an additional inner_mode (DImode) 
generated from "smallest_int_mode_for_size (shift)" to get rhs_subreg, 
otherwise it will return NULL if new_mode is equal to store_mode.

Lastly, move the BITS_PER_UNIT check after multiple_p, as it still needs to generate
shift_seq for other circumstances. :)


[PATCH v3] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 21 --
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..aaa161237c3 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,8 +1736,6 @@ find_shift_sequence (poly_int64 access_size,
   int cost;
 
   new_mode = new_mode_iter.require ();
-  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
-   break;
 
   /* If a constant was stored into memory, try to simplify it here,
 otherwise the cost of the shift might preclude this optimization
@@ -1779,6 +1777,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
 
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift))
+   {
+ /* Try to implement the shift using a subreg.  */
+ scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
+ poly_int64 offset
+   = subreg_offset_from_lsb (new_mode, store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
+   store_mode, offset);
+ if (rhs_subreg)
+   {
+ read_reg = extract_low_bits (read_mode, inner_mode,
+  copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
+   break;
+
   new_reg = gen_reg_rtx (new_mode);
 
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define 

[PATCH] [og10] Fix gfortran.dg/goacc/routine-module-mod-1.f90 testcase

2020-07-22 Thread Kwok Cheung Yeung
The testcase gfortran.dg/goacc/routine-module-mod-1.f90 fails due to an extra 
'warning: region is worker partitioned but does not contain worker partitioned 
code' message in subroutine g_1. subroutine g_1 is marked with '!$acc routine 
gang', but the loop inside is only assigned gang vector loop parallelism, which 
triggers the message as there is no worker parallelism.


This patch makes the message expected. Okay for OG10 branch?

Kwok
From 824a4d600380a8b02bb65f055ff0423bbd849a4f Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Wed, 1 Jul 2020 08:26:42 -0700
Subject: [PATCH 6/6] Fix failure in testcase
 gfortran.dg/goacc/routine-module-mod-1.f90

2020-07-21  Kwok Cheung Yeung  

gcc/testsuite/
* gfortran.dg/goacc/routine-module-mod-1.f90 (g_1): Add
expected output.
---
 gcc/testsuite/gfortran.dg/goacc/routine-module-mod-1.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gfortran.dg/goacc/routine-module-mod-1.f90 
b/gcc/testsuite/gfortran.dg/goacc/routine-module-mod-1.f90
index 23c673f..2ea4094 100644
--- a/gcc/testsuite/gfortran.dg/goacc/routine-module-mod-1.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/routine-module-mod-1.f90
@@ -50,7 +50,7 @@ contains
 end do
   end subroutine w_1
 
-  subroutine g_1
+  subroutine g_1 ! { dg-warning "region is worker partitioned but does not 
contain worker partitioned code" }
 implicit none
 !$acc routine gang
 
-- 
2.8.1



c++: More modern C++ variable decls

2020-07-22 Thread Nathan Sidwell
Here are some more places where we can declare variables at the 
assignment point, rather than use C89.  Also, let's name our variables 
by what they contain -- the register allocator is perfectly able to 
track liveness for us.


gcc/cp/
* decl.c (decls_match): Move variables into scopes
they're needed in.
(duplicate_decls): Use STRIP_TEMPLATE.
(build_typename_type): Move var decls to their assignments.
(begin_function_body): Likewise.
* decl2.c (get_guard): Likewise.
(mark_used): Use true for truthiness.
* error.c (dump_aggr_type): Hold the decl in a var called
'decl', not 'name'.


nathan
--
Nathan Sidwell
diff --git i/gcc/cp/decl.c w/gcc/cp/decl.c
index db91b50a4f0..24cfb8e4b27 100644
--- i/gcc/cp/decl.c
+++ w/gcc/cp/decl.c
@@ -974,12 +974,6 @@ decls_match (tree newdecl, tree olddecl, bool record_versions /* = true */)
 
   if (TREE_CODE (newdecl) == FUNCTION_DECL)
 {
-  tree f1 = TREE_TYPE (newdecl);
-  tree f2 = TREE_TYPE (olddecl);
-  tree p1 = TYPE_ARG_TYPES (f1);
-  tree p2 = TYPE_ARG_TYPES (f2);
-  tree r2;
-
   /* Specializations of different templates are different functions
 	 even if they have the same type.  */
   tree t1 = (DECL_USE_TEMPLATE (newdecl)
@@ -1002,14 +996,20 @@ decls_match (tree newdecl, tree olddecl, bool record_versions /* = true */)
 	  && DECL_EXTERN_C_P (olddecl) && !DECL_EXTERN_C_P (newdecl))
 	return 0;
 
+  tree f1 = TREE_TYPE (newdecl);
+  tree f2 = TREE_TYPE (olddecl);
   if (TREE_CODE (f1) != TREE_CODE (f2))
 	return 0;
 
   /* A declaration with deduced return type should use its pre-deduction
 	 type for declaration matching.  */
-  r2 = fndecl_declared_return_type (olddecl);
+  tree r2 = fndecl_declared_return_type (olddecl);
+  tree r1 = fndecl_declared_return_type (newdecl);
+
+  tree p1 = TYPE_ARG_TYPES (f1);
+  tree p2 = TYPE_ARG_TYPES (f2);
 
-  if (same_type_p (TREE_TYPE (f1), r2))
+  if (same_type_p (r1, r2))
 	{
 	  if (!prototype_p (f2) && DECL_EXTERN_C_P (olddecl)
 	  && fndecl_built_in_p (olddecl))
@@ -2457,14 +2457,7 @@ duplicate_decls (tree newdecl, tree olddecl, bool newdecl_is_friend)
 	  || (TREE_CODE (olddecl) == TEMPLATE_DECL
 		  && (TREE_CODE (DECL_TEMPLATE_RESULT (olddecl))
 		  == FUNCTION_DECL
-	{
-	  tree fn = olddecl;
-
-	  if (TREE_CODE (fn) == TEMPLATE_DECL)
-	fn = DECL_TEMPLATE_RESULT (olddecl);
-
-	  new_redefines_gnu_inline = GNU_INLINE_P (fn) && DECL_INITIAL (fn);
-	}
+	new_redefines_gnu_inline = GNU_INLINE_P (STRIP_TEMPLATE (olddecl));
 
   if (!new_redefines_gnu_inline)
 	{
@@ -3852,11 +3845,7 @@ tree
 build_typename_type (tree context, tree name, tree fullname,
 		 enum tag_types tag_type)
 {
-  tree t;
-  tree d;
   typename_info ti;
-  tree *e;
-  hashval_t hash;
 
   if (typename_htab == NULL)
    typename_htab = hash_table<typename_hasher>::create_ggc (61);
@@ -3868,11 +3857,12 @@ build_typename_type (tree context, tree name, tree fullname,
   ti.class_p = (tag_type == class_type
 		|| tag_type == record_type
 		|| tag_type == union_type);
-  hash =  (htab_hash_pointer (ti.scope)
-	   ^ htab_hash_pointer (ti.name));
+  hashval_t hash =  (htab_hash_pointer (ti.scope)
+		 ^ htab_hash_pointer (ti.name));
 
   /* See if we already have this type.  */
-  e = typename_htab->find_slot_with_hash (&ti, hash, INSERT);
+  tree *e = typename_htab->find_slot_with_hash (&ti, hash, INSERT);
+  tree t = *e;
   if (*e)
 t = *e;
   else
@@ -3885,10 +3875,10 @@ build_typename_type (tree context, tree name, tree fullname,
   TYPENAME_IS_CLASS_P (t) = ti.class_p;
 
   /* Build the corresponding TYPE_DECL.  */
-  d = build_decl (input_location, TYPE_DECL, name, t);
-  TYPE_NAME (TREE_TYPE (d)) = d;
-  TYPE_STUB_DECL (TREE_TYPE (d)) = d;
-  DECL_CONTEXT (d) = FROB_CONTEXT (context);
+  tree d = build_decl (input_location, TYPE_DECL, name, t);
+  TYPE_NAME (t) = d;
+  TYPE_STUB_DECL (t) = d;
+  DECL_CONTEXT (d) = ti.scope;
   DECL_ARTIFICIAL (d) = 1;
 
   /* Store it in the hash table.  */
@@ -4061,9 +4051,6 @@ tree
 make_unbound_class_template (tree context, tree name, tree parm_list,
 			 tsubst_flags_t complain)
 {
-  tree t;
-  tree d;
-
   if (TYPE_P (name))
 name = TYPE_IDENTIFIER (name);
   else if (DECL_P (name))
@@ -4108,16 +4095,16 @@ make_unbound_class_template (tree context, tree name, tree parm_list,
 }
 
   /* Build the UNBOUND_CLASS_TEMPLATE.  */
-  t = cxx_make_type (UNBOUND_CLASS_TEMPLATE);
+  tree t = cxx_make_type (UNBOUND_CLASS_TEMPLATE);
   TYPE_CONTEXT (t) = FROB_CONTEXT (context);
   TREE_TYPE (t) = NULL_TREE;
   SET_TYPE_STRUCTURAL_EQUALITY (t);
 
   /* Build the corresponding TEMPLATE_DECL.  */
-  d = build_decl (input_location, TEMPLATE_DECL, name, t);
-  TYPE_NAME (TREE_TYPE (d)) = d;
-  TYPE_STUB_DECL (TREE_TYPE (d)) = d;
-  DECL_CONTEXT (d) = FROB_CONTEXT 

c++: Shrink lambda-expr

2020-07-22 Thread Nathan Sidwell
I noticed the default capture mode and the discriminator both used ints. 
 That seems excessive.  This shrinks them to 8 bits and 16 bits 
respectively.  I suppose the discriminator could use the remaining 24 
bits of an int allocation unit, if we're worried about more than 64K 
lambdas per function.  I know, users are strange :)  On a 64 bit system 
this saves 64 bits, because we also had 32 bits of padding added.


gcc/cp/
* cp-tree.h (struct tree_lambda_expr): Shrink
default_capture_mode & discriminator.


--
Nathan Sidwell
diff --git i/gcc/cp/cp-tree.h w/gcc/cp/cp-tree.h
index a599f3bad1c..d43c53ae29a 100644
--- i/gcc/cp/cp-tree.h
+++ w/gcc/cp/cp-tree.h
@@ -1442,8 +1442,8 @@ struct GTY (()) tree_lambda_expr
   tree extra_scope;
   vec *pending_proxies;
   location_t locus;
-  enum cp_lambda_default_capture_mode_type default_capture_mode;
-  int discriminator;
+  enum cp_lambda_default_capture_mode_type default_capture_mode : 8;
+  short int discriminator;
 };
 
 /* Non-zero if this template specialization has access violations that


[PATCH] [RFC] vect: Fix infinite loop while determining peeling amount

2020-07-22 Thread Stefan Schulze Frielinghaus via Gcc-patches
This is a follow up to commit 5c9669a0e6c respectively discussion
https://gcc.gnu.org/pipermail/gcc-patches/2020-June/549132.html

In case that an alignment constraint is less than the size of a
corresponding scalar type, ensure that we advance at least by one
iteration.  For example, on s390x we have for a long double an alignment
constraint of 8 bytes whereas the size is 16 bytes.  Therefore,
TARGET_ALIGN / DR_SIZE equals zero resulting in an infinite loop which
can be reproduced by the following MWE:

extern long double *a;
extern double *b;
void fun(void) {
  for (int i = 0; i < 42; i++)
a[i] = b[i];
}

Increasing the number of peelings in each iteration at least by one
fixes the issue for me.  Any comments?

Bootstrapped and regtested on s390x.

gcc/ChangeLog:

* tree-vect-data-refs.c (vect_enhance_data_refs_alignment):
Ensure that loop variable npeel_tmp advances in each iteration.
---
 gcc/tree-vect-data-refs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index e35a215e042..a78ae61d1b0 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -1779,7 +1779,7 @@ vect_enhance_data_refs_alignment (loop_vec_info 
loop_vinfo)
 {
   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
dr_info, npeel_tmp);
- npeel_tmp += target_align / dr_size;
+ npeel_tmp += MAX (1, target_align / dr_size);
 }
 
  one_misalignment_known = true;
-- 
2.25.3



Re: [committed] libstdc++: Add std::from_chars for floating-point types

2020-07-22 Thread Jonathan Wakely via Gcc-patches

On 21/07/20 20:08 +, Joseph Myers wrote:

On Tue, 21 Jul 2020, Jonathan Wakely via Gcc-patches wrote:


I also noticed some strings give an underflow error with glibc's
strtod, but are valid for the Microsoft implementation. For example,
this one:
https://github.com/microsoft/STL/blob/master/tests/std/tests/P0067R5_charconv/double_from_chars_test_cases.hpp#L265

Without the final '1' digit glibc returns DBL_MIN, but with the final
'1' digit (so a number larger than DBL_MIN) it underflows. Is that
expected?


That's DBL_TRUE_MIN, not DBL_MIN.  The IEEE rule is that, with default
exception handling, an exact subnormal result does not raise the underflow
exception flag, whereas an inexact tiny result raises both inexact and
underflow flags; glibc mostly doesn't try to ensure an (exact) underflow
exception is signaled for the case of exact underflow with traps on that
exception enabled, only correct flags raised with default exception
handling.

(The way tininess is determined depends on the architecture.  glibc strtod
handles both cases, before-rounding and after-rounding architectures,
modulo oddities where implementations of some CPU architectures don't
appear to be consistent in their before-rounding / after-rounding choice.
Note that on after-rounding architectures it depends on the result after
rounding with normal precision but unbounded exponent range.  One
consequence of that is that the bound on the number of digits after the
decimal point that may need to be considered, beyond just knowing whether
any of them are nonzero, to determine the correctly rounded result and
exceptions, is e.g. 1076 for binary64, not 1075; that is, two digits are
needed beyond those needed for an exact representation of the least
subnormal value, although only one such digit is needed for the correctly
rounded result if you ignore after-rounding tininess detection.)


Aha, thanks very much for the explanation.




[PATCH] [og10] Fix goacc/loop-2-kernels.f95 testcase

2020-07-22 Thread Kwok Cheung Yeung
This test fails because the "'seq' overrides other OpenACC loop specifiers" 
error is not appearing in the compiler output. The C-equivalent version of the 
test (c-c++-common/goacc/loop-2-kernels.c) has these tests XFAILed in the commit 
'Make new OpenACC kernels conversion the default; adjust and add tests' (commit 
757f56ddc43fd80bb8740222ec352111b26d66e9), so the Fortran version should 
probably be XFAILed too.


Okay for OG10?

Kwok
From 87cf165b9b45f4cedd9cda362d9238486024a527 Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Thu, 25 Jun 2020 12:05:18 -0700
Subject: [PATCH 5/6] XFAIL tests in gfortran.dg/goacc/loop-2-kernels.f95

2020-07-21  Kwok Cheung Yeung  

gcc/testsuite/
* gfortran.dg/goacc/loop-2-kernels.f95: Add XFAILs.
---
 gcc/testsuite/gfortran.dg/goacc/loop-2-kernels.f95 | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/gfortran.dg/goacc/loop-2-kernels.f95 
b/gcc/testsuite/gfortran.dg/goacc/loop-2-kernels.f95
index 874c62d..a4cf11c 100644
--- a/gcc/testsuite/gfortran.dg/goacc/loop-2-kernels.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/loop-2-kernels.f95
@@ -35,7 +35,7 @@ program test
   DO j = 1,10
   ENDDO
 ENDDO
-!$acc loop seq gang ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" }
+!$acc loop seq gang ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" "TODO" { xfail *-*-* } }
 DO i = 1,10
 ENDDO
 
@@ -60,7 +60,7 @@ program test
   DO j = 1,10
   ENDDO
 ENDDO
-!$acc loop seq worker ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" }
+!$acc loop seq worker ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" "TODO" { xfail *-*-* } }
 DO i = 1,10
 ENDDO
 !$acc loop gang worker
@@ -88,7 +88,7 @@ program test
   DO j = 1,10
   ENDDO
 ENDDO
-!$acc loop seq vector ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" }
+!$acc loop seq vector ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" "TODO" { xfail *-*-* } }
 DO i = 1,10
 ENDDO
 !$acc loop gang vector
@@ -101,7 +101,7 @@ program test
 !$acc loop auto
 DO i = 1,10
 ENDDO
-!$acc loop seq auto ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" }
+!$acc loop seq auto ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" "TODO" { xfail *-*-* } }
 DO i = 1,10
 ENDDO
 !$acc loop gang auto ! { dg-error "'auto' conflicts with other OpenACC 
loop specifiers" }
@@ -133,7 +133,7 @@ program test
   !$acc kernels loop gang(static:*)
   DO i = 1,10
   ENDDO
-  !$acc kernels loop seq gang ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" }
+  !$acc kernels loop seq gang ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" "TODO" { xfail *-*-* } }
   DO i = 1,10
   ENDDO
 
@@ -146,7 +146,7 @@ program test
   !$acc kernels loop worker(num:5)
   DO i = 1,10
   ENDDO
-  !$acc kernels loop seq worker ! { dg-error "'seq' overrides other OpenACC 
loop specifiers" }
+  !$acc kernels loop seq worker ! { dg-error "'seq' overrides other OpenACC 
loop specifiers" "TODO" { xfail *-*-* } }
   DO i = 1,10
   ENDDO
   !$acc kernels loop gang worker
@@ -162,7 +162,7 @@ program test
   !$acc kernels loop vector(length:5)
   DO i = 1,10
   ENDDO
-  !$acc kernels loop seq vector ! { dg-error "'seq' overrides other OpenACC 
loop specifiers" }
+  !$acc kernels loop seq vector ! { dg-error "'seq' overrides other OpenACC 
loop specifiers" "TODO" { xfail *-*-* } }
   DO i = 1,10
   ENDDO
   !$acc kernels loop gang vector
@@ -175,7 +175,7 @@ program test
   !$acc kernels loop auto
   DO i = 1,10
   ENDDO
-  !$acc kernels loop seq auto ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" }
+  !$acc kernels loop seq auto ! { dg-error "'seq' overrides other OpenACC loop 
specifiers" "TODO" { xfail *-*-* } }
   DO i = 1,10
   ENDDO
   !$acc kernels loop gang auto ! { dg-error "'auto' conflicts with other 
OpenACC loop specifiers" }
-- 
2.8.1



preprocessor: add_path simplification

2020-07-22 Thread Nathan Sidwell
I noticed add_path was calling strlen more than once on the same string. 
 Let's not do that.


gcc/
* incpath.c (add_path): Avoid multiple strlen calls.

--
Nathan Sidwell
diff --git i/gcc/incpath.c w/gcc/incpath.c
index 8a2bda00f80..8437939bf1e 100644
--- i/gcc/incpath.c
+++ w/gcc/incpath.c
@@ -52,8 +52,8 @@ static void free_path (struct cpp_dir *, int);
 static void merge_include_chains (const char *, cpp_reader *, int);
 static void add_sysroot_to_chain (const char *, int);
 static struct cpp_dir *remove_duplicates (cpp_reader *, struct cpp_dir *,
-	   struct cpp_dir *,
-	   struct cpp_dir *, int);
+	  struct cpp_dir *, struct cpp_dir *,
+	  int);
 
 /* Include chains heads and tails.  */
 static struct cpp_dir *heads[INC_MAX];
@@ -432,6 +432,7 @@ void
 add_path (char *path, incpath_kind chain, int cxx_aware, bool user_supplied_p)
 {
   cpp_dir *p;
+  size_t pathlen = strlen (path);
 
 #if defined (HAVE_DOS_BASED_FILE_SYSTEM)
   /* Remove unnecessary trailing slashes.  On some versions of MS
@@ -439,18 +440,19 @@ add_path (char *path, incpath_kind chain, int cxx_aware, bool user_supplied_p)
  On newer versions, stat() does not recognize a directory that ends
  in a '\\' or '/', unless it is a drive root dir, such as "c:/",
  where it is obligatory.  */
-  int pathlen = strlen (path);
   char* end = path + pathlen - 1;
   /* Preserve the lead '/' or lead "c:/".  */
   char* start = path + (pathlen > 2 && path[1] == ':' ? 3 : 1);
 
   for (; end > start && IS_DIR_SEPARATOR (*end); end--)
 *end = 0;
+  pathlen = end - path;
 #endif
 
   p = XNEW (cpp_dir);
   p->next = NULL;
   p->name = path;
+  p->len = pathlen;
 #ifndef INO_T_EQ
   p->canonical_name = lrealpath (path);
 #endif


c++: mangler simplifications

2020-07-22 Thread Nathan Sidwell
I noticed the mangler's handling of templates could be simplified. We 
know template_info is non-null, which is sufficiently boolean -- no need 
for an explicit bool return.  Also, some of the internals of 
template_args_equal had crept into find_substitution.  Let's not do that.


gcc/cp/
* mangle.c (decl_is_template_id): Rename to ...
(maybe_template_info): ... here.  Return the template info,
rather than use a pointer.  Adjust all callers.
(find_substitution): Use template_args_equal, rather than
local check.

--
Nathan Sidwell
diff --git i/gcc/cp/mangle.c w/gcc/cp/mangle.c
index 43ff2e84db5..9fd30011288 100644
--- i/gcc/cp/mangle.c
+++ w/gcc/cp/mangle.c
@@ -170,7 +170,7 @@ integer_type_codes[itk_none] =
   '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'
 };
 
-static int decl_is_template_id (const tree, tree* const);
+static tree maybe_template_info (const tree);
 
 /* Functions for handling substitutions.  */
 
@@ -272,11 +272,10 @@ static tree mangle_special_for_type (const tree, const char *);
   write_number ((NUMBER), /*unsigned_p=*/1, 10)
 
 /* If DECL is a template instance (including the uninstantiated template
-   itself), return nonzero and, if TEMPLATE_INFO is non-NULL, set
-   *TEMPLATE_INFO to its template info.  Otherwise return zero.  */
+   itself), return its TEMPLATE_INFO.  Otherwise return NULL.  */
 
-static int
-decl_is_template_id (const tree decl, tree* const template_info)
+static tree
+maybe_template_info (const tree decl)
 {
   if (TREE_CODE (decl) == TYPE_DECL)
 {
@@ -285,33 +284,20 @@ decl_is_template_id (const tree decl, tree* const template_info)
   const tree type = TREE_TYPE (decl);
 
   if (CLASS_TYPE_P (type) && CLASSTYPE_TEMPLATE_ID_P (type))
-	{
-	  if (template_info != NULL)
-	/* For a templated TYPE_DECL, the template info is hanging
-	   off the type.  */
-	*template_info = TYPE_TEMPLATE_INFO (type);
-	  return 1;
-	}
+	return TYPE_TEMPLATE_INFO (type);
 }
   else
 {
-  /* Check if this is a primary template.  */
+  /* Check if the template is a primary template.  */
   if (DECL_LANG_SPECIFIC (decl) != NULL
 	  && VAR_OR_FUNCTION_DECL_P (decl)
 	  && DECL_TEMPLATE_INFO (decl)
-	  && PRIMARY_TEMPLATE_P (DECL_TI_TEMPLATE (decl))
-	  && TREE_CODE (decl) != TEMPLATE_DECL)
-	{
-	  if (template_info != NULL)
-	/* For most templated decls, the template info is hanging
-	   off the decl.  */
-	*template_info = DECL_TEMPLATE_INFO (decl);
-	  return 1;
-	}
+	  && PRIMARY_TEMPLATE_P (DECL_TI_TEMPLATE (decl)))
+	return DECL_TEMPLATE_INFO (decl);
 }
 
   /* It's not a template id.  */
-  return 0;
+  return NULL_TREE;
 }
 
 /* Produce debugging output of current substitution candidates.  */
@@ -628,9 +614,7 @@ find_substitution (tree node)
 	{
 	  tree args = CLASSTYPE_TI_ARGS (type);
 	  if (TREE_VEC_LENGTH (args) == 3
-		  && (TREE_CODE (TREE_VEC_ELT (args, 0))
-		  == TREE_CODE (char_type_node))
-		  && same_type_p (TREE_VEC_ELT (args, 0), char_type_node)
+		  && template_args_equal (TREE_VEC_ELT (args, 0), char_type_node)
 		  && is_std_substitution_char (TREE_VEC_ELT (args, 1),
 	   SUBID_CHAR_TRAITS)
 		  && is_std_substitution_char (TREE_VEC_ELT (args, 2),
@@ -654,8 +638,7 @@ find_substitution (tree node)
 	 args  > .  */
   tree args = CLASSTYPE_TI_ARGS (type);
   if (TREE_VEC_LENGTH (args) == 2
-	  && TREE_CODE (TREE_VEC_ELT (args, 0)) == TREE_CODE (char_type_node)
-	  && same_type_p (TREE_VEC_ELT (args, 0), char_type_node)
+	  && template_args_equal (TREE_VEC_ELT (args, 0), char_type_node)
 	  && is_std_substitution_char (TREE_VEC_ELT (args, 1),
    SUBID_CHAR_TRAITS))
 	{
@@ -800,7 +783,7 @@ mangle_return_type_p (tree decl)
   return (!DECL_CONSTRUCTOR_P (decl)
 	  && !DECL_DESTRUCTOR_P (decl)
 	  && !DECL_CONV_FN_P (decl)
-	  && decl_is_template_id (decl, NULL));
+	  && maybe_template_info (decl));
 }
 
 /*   		::=  
@@ -827,9 +810,8 @@ write_encoding (const tree decl)
 {
   tree fn_type;
   tree d;
-  bool tmpl = decl_is_template_id (decl, NULL);
 
-  if (tmpl)
+  if (maybe_template_info (decl))
 	{
 	  fn_type = get_mostly_instantiated_function_type (decl);
 	  /* FN_TYPE will not have parameter types for in-charge or
@@ -933,13 +915,12 @@ write_name (tree decl, const int ignore_local_scope)
 	  || (abi_version_at_least (7)
 		  && TREE_CODE (context) == PARM_DECL
 {
-  tree template_info;
   /* Is this a template instance?  */
-  if (decl_is_template_id (decl, _info))
+  if (tree info = maybe_template_info (decl))
 	{
 	  /* Yes: use .  */
-	  write_unscoped_template_name (TI_TEMPLATE (template_info));
-	  write_template_args (TI_ARGS (template_info));
+	  write_unscoped_template_name (TI_TEMPLATE (info));
+	  write_template_args (TI_ARGS (info));
 	}
   else
 	/* Everything else gets an .  */
@@ -1041,8 +1022,6 @@ 

RE: RE: [PATCH PR95696] regrename creates overlapping register allocations for vliw

2020-07-22 Thread Zhongyunde

> -Original Message-
> From: Richard Sandiford [mailto:richard.sandif...@arm.com]
> Sent: Wednesday, July 22, 2020 12:12 AM
> To: Zhongyunde 
> Cc: gcc-patches@gcc.gnu.org; Yangfei (A) 
> Subject: Re: 答复: [PATCH PR95696] regrename creates overlapping
> register allocations for vliw
> 
> Zhongyunde  writes:
> > Thanks for your review.
> >
> > First of all, this is an optimization.
> 
> OK, good.
> 
> >gcc do sms before reload, and here each insn use pseudo-register.
> After reload, they are allocated hard-register, then the regrename pass try
> to adjust the register number with def/use chain created by
> build_def_use.
> >  As now gcc doesn't consider the VLIW bundles, the regrename pass may
> update a reg which is not really unused, which will bring in invalid
> VLIW bundles.
> >Before the final schedule, we usually recheck the validity of the VLIW
> bundles, and reschedule the conflicting insns into two VLIWs to make them
> valid and avoid the above issue, so this is not a correctness issue.
> >  Certainly, rescheduling the conflicting insns into two VLIWs will destroy
> the kernel loop's sms schedule result, and usually it will be harmful to the
> performance.
> 
> Yeah.  The reason I was worried about the TI markers being stale is that, in
> general, register allocation can introduce new spills and reloads, can add
> and remove instructions, and can convert instructions into different forms
> (e.g. as a result of register elimination).
> There are then post-reload optimisers that can change the code further.
> All these things could invalidate the VLIW bundling done by the first
> scheduler.
> 
> It sounds like that's not happening in your motivating testcase, and the
> VLIW bundling is still correct (for this loop) by the time that regrename
> runs.  Is that right?

Yes, it is right.

> It's interesting that this is for a testcase using SMS.  One of the 
> traditional
> problems with the GCC implementation of SMS has been ensuring that
> later passes don't mess up the scheduled loop.  So in your testcase, does
> register allocation succeed for the SMS loop without invalidating the
> bundling decisions?

Yes.

> If so, then it's probably better to avoid running regrename on it at all.
> It mostly exists to help the second scheduling pass, but the second
> scheduling pass shouldn't be messing with an SMS loop anyway.  Also,
> although the patch deals with one case in which regrename could disrupt
> the bundling, there are others too.
> 
> So maybe one option would be to make regrename ignore blocks that
> have BB_DISABLE_SCHEDULE set.  (Sorry if that's been discussed and
> discounted
> already.)

ok, according your advice, I make a new patch attached.

> Thanks,
> Richard


PR95696_1.patch
Description: PR95696_1.patch


Re: [PATCH 1/2] Add new RTX instruction class FILLER_INSN

2020-07-22 Thread Andrea Corallo
Richard Biener  writes:

> I wonder if such effect of instructions on the pipeline can be modeled
> in the DFA and thus whether the scheduler could issue (always ready)
> NOPs?

I might be wrong, but the DFA model should be reasoning in terms of
executed instructions given an execution path; on the contrary, this is
taking into account the 'footprint' of the branches of a program in
memory.  This is what some u-archs are sensitive to.

  Andrea


[PATCH] [og10] Fix routine-nohost-1.c testcase

2020-07-22 Thread Kwok Cheung Yeung
The test c-c++-common/goacc/routine-nohost-1.c currently fails because it fails 
to find some tree dump output. The problem is that the relevant tree pass is now 
oaccloops rather than oaccdevlow.


This patch corrects the requested tree dump. I will be committing this one in 
OG10 as 'obvious'.


Kwok
From f921b0988c41ba086e968faf08e93f7a230e55a1 Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Fri, 19 Jun 2020 09:34:27 -0700
Subject: [PATCH 4/6] Fix failure in testcase
 c-c++-common/goacc/routine-nohost-1.c

2020-07-21  Kwok Cheung Yeung  

gcc/testsuite/
* c-c++-common/goacc/routine-nohost-1.c: Change tree dump pass to
oaccloops.
---
 gcc/testsuite/c-c++-common/goacc/routine-nohost-1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/c-c++-common/goacc/routine-nohost-1.c 
b/gcc/testsuite/c-c++-common/goacc/routine-nohost-1.c
index 9baa56c..ebeaadb 100644
--- a/gcc/testsuite/c-c++-common/goacc/routine-nohost-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/routine-nohost-1.c
@@ -1,7 +1,7 @@
 /* Test the nohost clause for OpenACC routine directive.  Exercising different
variants for declaring routines.  */
 
-/* { dg-additional-options "-fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-fdump-tree-oaccloops" } */
 
 #pragma acc routine nohost
 int THREE(void)
@@ -25,4 +25,4 @@ float ADD(float x, float y)
   return x + y;
 }
 
-/* { dg-final { scan-tree-dump-times "Discarding function" 3 "oaccdevlow" } } 
*/
+/* { dg-final { scan-tree-dump-times "Discarding function" 3 "oaccloops" } } */
-- 
2.8.1



Re: [PATCH] Add TARGET_LOWER_LOCAL_DECL_ALIGNMENT [PR95237]

2020-07-22 Thread H.J. Lu via Gcc-patches
On Wed, Jul 22, 2020 at 7:25 AM Dimitar Dimitrov  wrote:
>
> On сряда, 22 юли 2020 г. 2:04:35 EEST Sunil Pandey via Gcc-patches wrote:
> > On Tue, Jul 21, 2020 at 12:50 AM Richard Biener
> >
> >  wrote:
> > > On Tue, Jul 21, 2020 at 7:16 AM Sunil Pandey  wrote:
> > > > On Mon, Jul 20, 2020 at 5:06 AM Richard Biener
> > > >
> > > >  wrote:
> > > > > On Sat, Jul 18, 2020 at 7:57 AM Sunil Pandey 
> wrote:
> > > > > > On Fri, Jul 17, 2020 at 1:22 AM Richard Biener
> > > > > >
> > > > > >  wrote:
> > > > > > > On Fri, Jul 17, 2020 at 7:15 AM Sunil Pandey 
> wrote:
> > > > > > > > Any comment on revised patch? At least,  in finish_decl, decl
> > > > > > > > global attributes are populated.> > > > >
> > > > > > > +static void
> > > > > > > +ix86_lower_local_decl_alignment (tree decl)
> > > > > > > +{
> > > > > > > +  unsigned new_align = LOCAL_DECL_ALIGNMENT (decl);
> > > > > > >
> > > > > > > please use the macro-expanded call here since we want to amend
> > > > > > > ix86_local_alignment to _not_ return a lower alignment when
> > > > > > > called as LOCAL_DECL_ALIGNMENT (by adding a new parameter
> > > > > > > to ix86_local_alignment).  Can you also amend the patch in this
> > > > > > > way?
> > > > > > >
> > > > > > > +  if (new_align < DECL_ALIGN (decl))
> > > > > > > +SET_DECL_ALIGN (decl, new_align);
> > > > > > >
> > > > > > > diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c
> > > > > > > index 81bd2ee94f0..1ae99e30ed1 100644
> > > > > > > --- a/gcc/c/c-decl.c
> > > > > > > +++ b/gcc/c/c-decl.c
> > > > > > > @@ -5601,6 +5601,8 @@ finish_decl (tree decl, location_t init_loc,
> > > > > > > tree init,> > > > >
> > > > > > >  }
> > > > > > >
> > > > > > >invoke_plugin_callbacks (PLUGIN_FINISH_DECL, decl);
> > > > > > >
> > > > > > > +  /* Lower local decl alignment.  */
> > > > > > > +  lower_decl_alignment (decl);
> > > > > > >
> > > > > > >  }
> > > > > > >
> > > > > > > should come before plugin hook invocation, likewise for the
> > > > > > > cp_finish_decl case.
> > > > > > >
> > > > > > > +/* Lower DECL alignment.  */
> > > > > > > +
> > > > > > > +void
> > > > > > > +lower_decl_alignment (tree decl)
> > > > > > > +{
> > > > > > > +  if (VAR_P (decl)
> > > > > > > +  && !is_global_var (decl)
> > > > > > > +  && !DECL_HARD_REGISTER (decl))
> > > > > > > +targetm.lower_local_decl_alignment (decl);
> > > > > > > +}
> > > > > > >
> > > > > > > please avoid this function, it's name sounds too generic and it's
> > > > > > > not worth
> > > > > > > adding a public API for two calls.
> > > > > > >
> > > > > > > Alltogether this should avoid the x86 issue leaving left-overs
> > > > > > > (your identified inliner case) as missed optimization [for the
> > > > > > > linux kernel which apparently decided that
> > > > > > > -mpreferred-stack-boundary=2 is a good ABI to use].
> > > > > > >
> > > > > > > Richard.
> > > > > >
> > > > > > Revised patch attached.
> > > > >
> > > > > @@ -16776,7 +16783,7 @@ ix86_data_alignment (tree type, unsigned int
> > > > > align, bool opt)
> > > > >
> > > > >  unsigned int
> > > > >  ix86_local_alignment (tree exp, machine_mode mode,
> > > > >
> > > > > - unsigned int align)
> > > > > + unsigned int align, bool setalign)
> > > > >
> > > > >  {
> > > > >
> > > > >tree type, decl;
> > > > >
> > > > > @@ -16801,6 +16808,10 @@ ix86_local_alignment (tree exp, machine_mode
> > > > > mode,
> > > > >
> > > > >&& (!decl || !DECL_USER_ALIGN (decl)))
> > > > >
> > > > >  align = 32;
> > > > >
> > > > > +  /* Lower decl alignment.  */
> > > > > +  if (setalign && align < DECL_ALIGN (decl))
> > > > > +SET_DECL_ALIGN (decl, align);
> > > > > +
> > > > >
> > > > >/* If TYPE is NULL, we are allocating a stack slot for caller-save
> > > > >
> > > > >   register in MODE.  We will return the largest alignment of XF
> > > > >   and DF.  */
> > > > >
> > > > > sorry for not being clear - the parameter should indicate whether an
> > > > > alignment lower
> > > > > than natural alignment is OK to return thus sth like
> > > > >
> > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > index 31757b044c8..19703cbceb9 100644
> > > > > --- a/gcc/config/i386/i386.c
> > > > > +++ b/gcc/config/i386/i386.c
> > > > > @@ -16641,7 +16641,7 @@ ix86_data_alignment (tree type, unsigned int
> > > > > align, bool opt)
> > > > >
> > > > >  unsigned int
> > > > >  ix86_local_alignment (tree exp, machine_mode mode,
> > > > >
> > > > > - unsigned int align)
> > > > > + unsigned int align, bool may_lower)
> > > > >
> > > > >  {
> > > > >
> > > > >tree type, decl;
> > > > >
> > > > > @@ -16658,7 +16658,8 @@ ix86_local_alignment (tree exp, machine_mode
> > > > > mode,
> > > > >
> > > > >/* Don't do dynamic stack realignment for long long objects with
> > > > >
> > > > >   -mpreferred-stack-boundary=2.  */
> > > > >
> > > > > -  if (!TARGET_64BIT
> > > > > +  if 

Re: [PATCH] Add TARGET_LOWER_LOCAL_DECL_ALIGNMENT [PR95237]

2020-07-22 Thread Dimitar Dimitrov
On сряда, 22 юли 2020 г. 2:04:35 EEST Sunil Pandey via Gcc-patches wrote:
> On Tue, Jul 21, 2020 at 12:50 AM Richard Biener
> 
>  wrote:
> > On Tue, Jul 21, 2020 at 7:16 AM Sunil Pandey  wrote:
> > > On Mon, Jul 20, 2020 at 5:06 AM Richard Biener
> > > 
> > >  wrote:
> > > > On Sat, Jul 18, 2020 at 7:57 AM Sunil Pandey  
wrote:
> > > > > On Fri, Jul 17, 2020 at 1:22 AM Richard Biener
> > > > > 
> > > > >  wrote:
> > > > > > On Fri, Jul 17, 2020 at 7:15 AM Sunil Pandey  
wrote:
> > > > > > > Any comment on revised patch? At least,  in finish_decl, decl
> > > > > > > global attributes are populated.> > > > > 
> > > > > > +static void
> > > > > > +ix86_lower_local_decl_alignment (tree decl)
> > > > > > +{
> > > > > > +  unsigned new_align = LOCAL_DECL_ALIGNMENT (decl);
> > > > > > 
> > > > > > please use the macro-expanded call here since we want to amend
> > > > > > ix86_local_alignment to _not_ return a lower alignment when
> > > > > > called as LOCAL_DECL_ALIGNMENT (by adding a new parameter
> > > > > > to ix86_local_alignment).  Can you also amend the patch in this
> > > > > > way?
> > > > > > 
> > > > > > +  if (new_align < DECL_ALIGN (decl))
> > > > > > +SET_DECL_ALIGN (decl, new_align);
> > > > > > 
> > > > > > diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c
> > > > > > index 81bd2ee94f0..1ae99e30ed1 100644
> > > > > > --- a/gcc/c/c-decl.c
> > > > > > +++ b/gcc/c/c-decl.c
> > > > > > @@ -5601,6 +5601,8 @@ finish_decl (tree decl, location_t init_loc,
> > > > > > tree init,> > > > > 
> > > > > >  }
> > > > > >
> > > > > >invoke_plugin_callbacks (PLUGIN_FINISH_DECL, decl);
> > > > > > 
> > > > > > +  /* Lower local decl alignment.  */
> > > > > > +  lower_decl_alignment (decl);
> > > > > > 
> > > > > >  }
> > > > > > 
> > > > > > should come before plugin hook invocation, likewise for the
> > > > > > cp_finish_decl case.
> > > > > > 
> > > > > > +/* Lower DECL alignment.  */
> > > > > > +
> > > > > > +void
> > > > > > +lower_decl_alignment (tree decl)
> > > > > > +{
> > > > > > +  if (VAR_P (decl)
> > > > > > +  && !is_global_var (decl)
> > > > > > +  && !DECL_HARD_REGISTER (decl))
> > > > > > +targetm.lower_local_decl_alignment (decl);
> > > > > > +}
> > > > > > 
> > > > > > please avoid this function, it's name sounds too generic and it's
> > > > > > not worth
> > > > > > adding a public API for two calls.
> > > > > > 
> > > > > > Alltogether this should avoid the x86 issue leaving left-overs
> > > > > > (your identified inliner case) as missed optimization [for the
> > > > > > > linux kernel which apparently decided that
> > > > > > -mpreferred-stack-boundary=2 is a good ABI to use].
> > > > > > 
> > > > > > Richard.
> > > > > 
> > > > > Revised patch attached.
> > > > 
> > > > @@ -16776,7 +16783,7 @@ ix86_data_alignment (tree type, unsigned int
> > > > align, bool opt)
> > > > 
> > > >  unsigned int
> > > >  ix86_local_alignment (tree exp, machine_mode mode,
> > > > 
> > > > - unsigned int align)
> > > > + unsigned int align, bool setalign)
> > > > 
> > > >  {
> > > >  
> > > >tree type, decl;
> > > > 
> > > > @@ -16801,6 +16808,10 @@ ix86_local_alignment (tree exp, machine_mode
> > > > mode,
> > > > 
> > > >&& (!decl || !DECL_USER_ALIGN (decl)))
> > > >  
> > > >  align = 32;
> > > > 
> > > > +  /* Lower decl alignment.  */
> > > > +  if (setalign && align < DECL_ALIGN (decl))
> > > > +SET_DECL_ALIGN (decl, align);
> > > > +
> > > > 
> > > >/* If TYPE is NULL, we are allocating a stack slot for caller-save
> > > >
> > > >   register in MODE.  We will return the largest alignment of XF
> > > >   and DF.  */
> > > > 
> > > > sorry for not being clear - the parameter should indicate whether an
> > > > alignment lower
> > > > than natural alignment is OK to return thus sth like
> > > > 
> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > index 31757b044c8..19703cbceb9 100644
> > > > --- a/gcc/config/i386/i386.c
> > > > +++ b/gcc/config/i386/i386.c
> > > > @@ -16641,7 +16641,7 @@ ix86_data_alignment (tree type, unsigned int
> > > > align, bool opt)
> > > > 
> > > >  unsigned int
> > > >  ix86_local_alignment (tree exp, machine_mode mode,
> > > > 
> > > > - unsigned int align)
> > > > + unsigned int align, bool may_lower)
> > > > 
> > > >  {
> > > >  
> > > >tree type, decl;
> > > > 
> > > > @@ -16658,7 +16658,8 @@ ix86_local_alignment (tree exp, machine_mode
> > > > mode,
> > > > 
> > > >/* Don't do dynamic stack realignment for long long objects with
> > > >
> > > >   -mpreferred-stack-boundary=2.  */
> > > > 
> > > > -  if (!TARGET_64BIT
> > > > +  if (may_lower
> > > > +  && !TARGET_64BIT
> > > > 
> > > >&& align == 64
> > > >&& ix86_preferred_stack_boundary < 64
> > > >&& (mode == DImode || (type && TYPE_MODE (type) == DImode))
> > > > 
> > > > I also believe that spill_slot_alignment () 

[PATCH] [og10] Fix goacc/loop-processing-1.c testcase

2020-07-22 Thread Kwok Cheung Yeung
gcc.dg/goacc/loop-processing-1.c fails mainly because the dg-final directive at 
the end has been incorrectly split into two lines, which breaks it completely. 
The pass that emits the tested tree output is now oaccloops, not oaccdevlow.


'.UNIQUE (OACC_HEAD_MARK, 0, 1, 36)' is also changed to '.UNIQUE 
(OACC_HEAD_MARK, 0, 1, 68)'. This is due to OLF_DIM_BASE being raised from 5 to 
6 in the patch 'Various OpenACC reduction enhancements - ME and nvptx changes' 
(commit 59399e6736d18ded9ce91dbd3ca44c1f8280d452).


This patch fixes these issues. Okay for OG10?

Kwok
From 22e91315f3ce7c486017c6b9245dc1ea2d6bdede Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Fri, 19 Jun 2020 06:58:40 -0700
Subject: [PATCH 3/6] Fix broken testcase gcc.dg/goacc/loop-processing-1.c

2020-07-21  Kwok Cheung Yeung  

gcc/testsuite/
* gcc.dg/goacc/loop-processing-1.c: Remove erroneous line-break.
Specify correct tree pass for output.  Fix expected constant values.
---
 gcc/testsuite/gcc.dg/goacc/loop-processing-1.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c 
b/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
index 296b61d..9a64f1a 100644
--- a/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
+++ b/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
@@ -1,5 +1,5 @@
 /* Make sure that OpenACC loop processing happens.  */
-/* { dg-additional-options "-O2 -fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-O2 -fdump-tree-oaccloops" } */
 
 extern int place ();
 
@@ -15,5 +15,4 @@ void vector_1 (int *ary, int size)
   }
 }
 
-/* { dg-final { scan-tree-dump {
-OpenACC loops.*Loop 0\(0\).*Loop [0-9]{2}\(1\).*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_HEAD_MARK, 0, 1, 36\);.*Head-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_HEAD_MARK, 0, 1, 36\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, 
\.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(6\).*\.data_dep\.[0-9_]+ = 
\.UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_HEAD_MARK, 0, 2, 6\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, 
\.data_dep\.[0-9_]+, 1\);.*Head-1:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_HEAD_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-1:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 2\);.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_JOIN, \.data_dep\.[0-9_]+, 1\);} "oaccdevlow" } } */
+/* { dg-final { scan-tree-dump {OpenACC loops.*Loop 0\(0\).*Loop 
[0-9]{2}\(1\).*.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 1, 
68\);.*Head-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_HEAD_MARK, 0, 1, 
68\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 
0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_TAIL_MARK, 
\.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_JOIN, 
\.data_dep\.[0-9_]+, 0\);.*Loop 6\(6\).*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_HEAD_MARK, 0, 2, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_HEAD_MARK, 0, 2, 6\);.*\.data_dep\.[0-9_]+ = \.UNIQUE \(OACC_FORK, 
\.data_dep\.[0-9_]+, 1\);.*Head-1:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_HEAD_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-1:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 2\);.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = \.UNIQUE 
\(OACC_JOIN, \.data_dep\.[0-9_]+, 1\);} "oaccloops" } } */
\ No newline at end of file
-- 
2.8.1



[committed] libstdc++: Constrain reverse_iterator and move_iterator conversions [LWG 3435]

2020-07-22 Thread Jonathan Wakely via Gcc-patches
libstdc++-v3/ChangeLog:

* include/bits/stl_iterator.h (reverse_iterator): Constrain
converting constructor and converting assignment operator.
Access source iterator's data member directly instead of
calling base().
(move_iterator): Likewise.
* testsuite/24_iterators/move_iterator/dr3435.cc: New test.
* testsuite/24_iterators/reverse_iterator/dr3435.cc: New test.

Tested powerpc64le-linux, committed to trunk.


commit a5a8a4e61565a2a66391e29eb80813c581b7dc52
Author: Jonathan Wakely 
Date:   Wed Jul 22 13:25:11 2020

libstdc++: Constrain reverse_iterator and move_iterator conversions [LWG 
3435]

libstdc++-v3/ChangeLog:

* include/bits/stl_iterator.h (reverse_iterator): Constrain
converting constructor and converting assignment operator.
Access source iterator's data member directly instead of
calling base().
(move_iterator): Likewise.
* testsuite/24_iterators/move_iterator/dr3435.cc: New test.
* testsuite/24_iterators/reverse_iterator/dr3435.cc: New test.

diff --git a/libstdc++-v3/include/bits/stl_iterator.h 
b/libstdc++-v3/include/bits/stl_iterator.h
index 6d2d19eb068..60bb40a659f 100644
--- a/libstdc++-v3/include/bits/stl_iterator.h
+++ b/libstdc++-v3/include/bits/stl_iterator.h
@@ -129,6 +129,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  typename iterator_traits<_Iterator>::pointer,
   typename iterator_traits<_Iterator>::reference>
 {
+#if __cplusplus >= 201103L
+  template
+   friend class reverse_iterator;
+#endif
+
+#if __cpp_lib_concepts
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 3435. three_way_comparable_with, [...]>
+  template
+   static constexpr bool __convertible = !is_same_v<_Iter, _Iterator>
+   && convertible_to;
+#endif
+
 protected:
   _Iterator current;
 
@@ -182,9 +195,27 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  underlying %iterator can be converted to the type of @c current.
   */
   template
+#if __cpp_lib_concepts
+   requires __convertible<_Iter>
+#endif
_GLIBCXX17_CONSTEXPR
 reverse_iterator(const reverse_iterator<_Iter>& __x)
-   : current(__x.base()) { }
+   : current(__x.current) { }
+
+#if __cplusplus >= 201103L
+  template
+#if __cpp_lib_concepts
+   requires __convertible<_Iter>
+ && assignable_from<_Iterator&, const _Iter&>
+#endif
+   _GLIBCXX17_CONSTEXPR
+   reverse_iterator&
+   operator=(const reverse_iterator<_Iter>& __x)
+   {
+ current = __x.current;
+ return *this;
+   }
+#endif
 
   /**
*  @return  @c current, the %iterator used for underlying work.
@@ -1270,6 +1301,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   using __base_ref = typename __traits_type::reference;
 #endif
 
+  template
+   friend class move_iterator;
+
+#if __cpp_lib_concepts
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 3435. three_way_comparable_with, [...]>
+  template
+   static constexpr bool __convertible = !is_same_v<_Iter2, _Iterator>
+   && convertible_to;
+#endif
+
 public:
   using iterator_type = _Iterator;
 
@@ -1303,15 +1345,22 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   : _M_current(std::move(__i)) { }
 
   template
+#if __cpp_lib_concepts
+   requires __convertible<_Iter>
+#endif
_GLIBCXX17_CONSTEXPR
move_iterator(const move_iterator<_Iter>& __i)
-   : _M_current(__i.base()) { }
+   : _M_current(__i._M_current) { }
 
   template
+#if __cpp_lib_concepts
+   requires __convertible<_Iter>
+ && assignable_from<_Iterator&, const _Iter&>
+#endif
_GLIBCXX17_CONSTEXPR
move_iterator& operator=(const move_iterator<_Iter>& __i)
{
- _M_current = __i.base();
+ _M_current = __i._M_current;
  return *this;
}
 
diff --git a/libstdc++-v3/testsuite/24_iterators/move_iterator/dr3435.cc 
b/libstdc++-v3/testsuite/24_iterators/move_iterator/dr3435.cc
new file mode 100644
index 000..170d4977df1
--- /dev/null
+++ b/libstdc++-v3/testsuite/24_iterators/move_iterator/dr3435.cc
@@ -0,0 +1,37 @@
+// Copyright (C) 2020 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// 

Re: [PATCH] c++: Add __builtin_bit_cast to implement std::bit_cast [PR93121]

2020-07-22 Thread Paul Koning via Gcc-patches



> On Jul 18, 2020, at 2:50 PM, Jakub Jelinek via Gcc-patches 
>  wrote:
> 
> Hi!
> 
> The following patch adds __builtin_bit_cast builtin, similarly to
> clang or MSVC which implement std::bit_cast using such an builtin too.
> It checks the various std::bit_cast requirements, when not constexpr
> evaluated acts pretty much like VIEW_CONVERT_EXPR of the source argument
> to the destination type and the hardest part is obviously the constexpr
> evaluation.  I couldn't use the middle-end native_encode_initializer,
> because it needs to handle the C++ CONSTRUCTOR_NO_CLEARING vs. negation of
> that, value initialization of missing members if there are any etc., and
> needs to handle bitfields even if they don't have an integral representative
> (I've left out PDP11 handling of those, couldn't figure out how exactly are
> bitfields laid out there).

It seems to be spelled out in builtins.c function c_readstr.

paul




Re: [PATCH 2/2] Aarch64: Add branch diluter pass

2020-07-22 Thread Andrea Corallo
Hi Andrew,

thanks for reviewing I'll work on your comments.  Just replying to the
high level questions.

Andrew Pinski  writes:

> On Wed, Jul 22, 2020 at 3:10 AM Andrea Corallo  wrote:
>>
>> Hi all,
>>
>> this second patch implements the AArch64 specific back-end pass
>> 'branch-dilution' controllable by the followings command line options:
>>
>> -mbranch-dilution
>>
>> --param=aarch64-branch-dilution-granularity={num}
>>
>> --param=aarch64-branch-dilution-max-branches={num}
>>
>> Some cores known to be able to benefit from this pass have been given
>> default tuning values for their granularity and max-branches.  Each
>> affected core has a very specific granule size and associated max-branch
>> limit.  This is a microarchitecture specific optimization.  Typical
>> usage should be -mbranch-dilution with a specified -mcpu.  Cores with a
>> granularity tuned to 0 will be ignored. Options are provided for
>> experimentation.
>
> Can you give a simple example of what this patch does?

Sure, this pass simply moves a sliding window over the insns, trying to
make sure that we never have more than 'max_branch' branches for every
'granule_size' insns.

If too many branches are detected, nops are added where considered less
harmful in order to correct that.

There are obviously many scenarios where the compiler can generate
branch-dense pieces of code, but say we have the equivalent of:


.L389:
bl  foo
b   .L43
.L388:
bl  foo
b   .L42
.L387:
bl  foo
b   .L41
.L386:
bl  foo
b   .L40


Assuming granule size 4 and max branches 2 this will be transformed in
the equivalent of:


.L389:
bl  foo
b   .L43
nop
nop
.L388:
bl  foo
b   .L42
nop
nop
.L387:
bl  foo
b   .L41
nop
nop
.L386:
bl  foo
b   .L40
nop
nop


> Also your testcases seem too sensitive to other optimizations which
> could happen.  E.g. the call to "branch (i)" could be pulled out of
> the switch statement.  Or even the "*i += N;" could be moved to one
> Basic block and the switch becomes just one if statement.
>
>> Observed performance improvements on Neoverse N1 SPEC CPU 2006 where
>> up to ~+3% (xalancbmk) and ~+1.5% (sjeng).  Average code size increase
>> for all the testsuite proved to be ~0.4%.
>
> Also does this improve any non-SPEC benchmarks or has it only been
> benchmarked with SPEC?

So far I have tried it only on SPEC 2006.  The transformation is not
benchmark-specific, though; other code may benefit from it.

Thanks

  Andrea


[PATCH] [og10] Fix goacc/routine-4-extern.c test

2020-07-22 Thread Kwok Cheung Yeung
This test fails due to expected warnings and errors not being emitted by GCC. 
The problem is that the 'gang reduction on an orphan loop' error is caught very 
early on in the frontends (c/c-typeck.c and cp/semantics.c), so the compiler 
aborts before getting to the tree passes that emit the missing warnings/errors.


This error handling was introduced by the patch 'Various OpenACC reduction 
enhancements - FE changes' (commit 6b3e1f7f05cd360bbd356b3f78511aa2ec3f40c3), 
but the later patch to fix affected testcases 'Various OpenACC reduction 
enhancements - test cases' (commit 6a0b5806b24bfdefe0b0f3ccbcc51299e5195dca) did 
not include a fix for routine-4-extern.c.


This patch removes the now outdated dg-error and dg-warnings from the test. Okay 
for OG10?


Kwok
From 5774f048563df311f2a35a654b8c2d7b1af9f2da Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Thu, 18 Jun 2020 14:31:17 -0700
Subject: [PATCH 2/6] Fix failures in c-c++-common/goacc/routine-4-extern.c
 testcase

2020-07-21  Kwok Cheung Yeung  

gcc/testsuite/
* c-c++-common/goacc/routine-4-extern.c (seq, vector, worker): Remove
obsolete expected warnings and errors.
---
 gcc/testsuite/c-c++-common/goacc/routine-4-extern.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/gcc/testsuite/c-c++-common/goacc/routine-4-extern.c 
b/gcc/testsuite/c-c++-common/goacc/routine-4-extern.c
index c23ddcf..5f8372c 100644
--- a/gcc/testsuite/c-c++-common/goacc/routine-4-extern.c
+++ b/gcc/testsuite/c-c++-common/goacc/routine-4-extern.c
@@ -21,14 +21,14 @@ void vector (void);
 #pragma acc routine seq
 void seq (void)
 {
-  extern_gang ();  /* { dg-error "routine call uses" } */
-  extern_worker ();  /* { dg-error "routine call uses" } */
-  extern_vector ();  /* { dg-error "routine call uses" } */
+  extern_gang ();
+  extern_worker ();
+  extern_vector ();
   extern_seq ();
 
   int red;
 
-#pragma acc loop reduction (+:red) // { dg-warning "insufficient partitioning" 
}
+#pragma acc loop reduction (+:red)
   for (int i = 0; i < 10; i++)
 red ++;
 
@@ -36,19 +36,19 @@ void seq (void)
   for (int i = 0; i < 10; i++)
 red ++;
 
-#pragma acc loop worker reduction (+:red) // { dg-error "disallowed by 
containing routine" }
+#pragma acc loop worker reduction (+:red)
   for (int i = 0; i < 10; i++)
 red ++;
 
-#pragma acc loop vector reduction (+:red) // { dg-error "disallowed by 
containing routine" }
+#pragma acc loop vector reduction (+:red)
   for (int i = 0; i < 10; i++)
 red ++;
 }
 
 void vector (void)
 {
-  extern_gang ();  /* { dg-error "routine call uses" } */
-  extern_worker ();  /* { dg-error "routine call uses" } */
+  extern_gang ();
+  extern_worker ();
   extern_vector ();
   extern_seq ();
 
@@ -62,7 +62,7 @@ void vector (void)
   for (int i = 0; i < 10; i++)
 red ++;
 
-#pragma acc loop worker reduction (+:red) // { dg-error "disallowed by 
containing routine" }
+#pragma acc loop worker reduction (+:red)
   for (int i = 0; i < 10; i++)
 red ++;
 
@@ -73,7 +73,7 @@ void vector (void)
 
 void worker (void)
 {
-  extern_gang ();  /* { dg-error "routine call uses" } */
+  extern_gang ();
   extern_worker ();
   extern_vector ();
   extern_seq ();
-- 
2.8.1



Re: [PATCH 1/2] Add new RTX instruction class FILLER_INSN

2020-07-22 Thread Richard Earnshaw (lists)
On 22/07/2020 13:24, Richard Biener via Gcc-patches wrote:
> On Wed, Jul 22, 2020 at 12:03 PM Andrea Corallo  
> wrote:
>>
>> Hi all,
>>
>> I'd like to submit the following two patches implementing a new AArch64
>> specific back-end pass that helps optimize branch-dense code, which can
>> be a bottleneck for performance on some Arm cores.  This is achieved by
>> padding out the branch-dense sections of the instruction stream with
>> nops.
>>
>> The original patch was already posted some time ago:
>>
>> https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg200721.html
>>
>> This follows up splitting as suggested in two patches, rebasing on
>> master and implementing the suggestions of the first code review.
>>
>> This first patch implements the addition of a new RTX instruction class
>> FILLER_INSN, which has been white listed to allow placement of NOPs
>> outside of a basic block.  This is to allow padding after unconditional
>> branches.  This is favorable so that any performance gained from
>> diluting branches is not paid straight back via excessive eating of
>> nops.
>>
>> It was deemed that a new RTX class was less invasive than modifying
>> behavior in regards to standard UNSPEC nops.
>>
>> 1/2 is requirement for 2/2.  Please see this the cover letter of this last
>> for more details on the pass itself.
> 
> I wonder if such effect of instructions on the pipeline can be modeled
> in the DFA and thus whether the scheduler could issue (always ready)
> NOPs?
> 
> I also wonder whether such optimization is better suited for the assembler
> which should know instruction lengths and alignment in a more precise
> way and also would know whether extra nops make immediates too large
> for pc relative things like short branches or section anchor accesses
> (or whatever else)?

No, the assembler should never spontaneously insert instructions.  That
breaks the branch range calculations that the compiler relies upon.

R.

> 
> Richard.
> 
>> Regards
>>
>>   Andrea
>>
>> gcc/ChangeLog
>>
>> 2020-07-17  Andrea Corallo  
>> Carey Williams  
>>
>> * cfgbuild.c (inside_basic_block_p): Handle FILLER_INSN.
>> * cfgrtl.c (rtl_verify_bb_layout): Whitelist FILLER_INSN outside
>> basic blocks.
>> * coretypes.h: New rtx class.
>> * emit-rtl.c (emit_filler_after): New function.
>> * rtl.def (FILLER_INSN): New rtl define.
>> * rtl.h (rtx_filler_insn): Define new structure.
>> (FILLER_INSN_P): New macro.
>> (is_a_helper ::test): New test helper for
>> rtx_filler_insn.
>> (emit_filler_after): New extern.
>> * target-insns.def: Add target insn definition.



Re: [PATCH] c++: decl_constant_value and unsharing [PR96197]

2020-07-22 Thread Patrick Palka via Gcc-patches
On Wed, 22 Jul 2020, Richard Biener wrote:

> On Tue, Jul 21, 2020 at 9:08 PM Patrick Palka via Gcc-patches
>  wrote:
> >
> > In the testcase from the PR we are seeing excessive memory use (> 5GB)
> > during constexpr evaluation, almost all of which is due to the call to
> > decl_constant_value in the VAR_DECL/CONST_DECL branch of
> > cxx_eval_constant_expression.  We reach here every time we evaluate an
> > ARRAY_REF of a constexpr VAR_DECL, which in this testcase is quite
> > often, and from there decl_constant_value makes an unshared copy of the
> > VAR_DECL's initializer, even though the unsharing is not needed at this
> > call site (because it is up to callers of cxx_eval_constant_expression
> > to unshare).
> >
> > To fix this, this patch moves the responsibility of unsharing the result
> > of decl_constant_value, decl_really_constant_value and
> > scalar_constant_value from the callee to the caller.
> >
> > Fortunately there's only six calls to these functions, two of which are
> > from cxx_eval_constant_expression where the unsharing is undesirable.
> > And in unify there is one call, to scalar_constant_value, that looks
> > like:
> >
> >case CONST_DECL:
> >  if (DECL_TEMPLATE_PARM_P (parm))
> >return ...;
> > >if (arg != scalar_constant_value (parm))
> >return ...;
> >
> > where we are suspiciously testing for pointer equality despite
> > scalar_constant_value's unsharing behavior.  This line seems to be dead
> > code however, so this patch replaces it with an appropriate gcc_assert.
> > Finally, this patch adds an explicit call to unshare_expr to the
> > remaining three callers.
> >
> > Now that the calls to decl_constant_value and
> > decl_really_constant_value from cxx_eval_constant_expression no longer
> > unshare their result, memory use during constexpr evaluation for the
> > testcase in the PR falls from 5GB to 15MB according to -ftime-report.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu, and also tested on
> > cmcstl2 and a number of other libraries.  Does this look OK to commit?
> 
> Can you add the PRs testcase?  Thanks for tracking this down! (but I can't
> approve the patch)
> 
> Richard.

Here's a patch with a reduced reproducer that consumes >6GB memory
during constexpr evaluation without the patch and just a few MB with:

-- >8 --

Subject: [PATCH] c++: decl_constant_value and unsharing [PR96197]

In the testcase from the PR we are seeing excessive memory use (> 5GB)
during constexpr evaluation, almost all of which is due to the call to
decl_constant_value in the VAR_DECL/CONST_DECL branch of
cxx_eval_constant_expression.  We reach here every time we evaluate an
ARRAY_REF of a constexpr VAR_DECL, which in this testcase is quite
often, and from there decl_constant_value makes an unshared copy of the
VAR_DECL's initializer, even though the unsharing is not needed at this
call site (because callers of cxx_eval_constant_expression already
unshare its result when necessary).

To fix this excessive unsharing, this patch moves the responsibility of
unsharing the result of decl_constant_value, decl_really_constant_value
and scalar_constant_value from the callee to the caller.

Fortunately there's just six calls to these functions, two of which are
from cxx_eval_constant_expression where the unsharing is undesirable.
And in unify there is one call, to scalar_constant_value, that looks
like:

   case CONST_DECL:
 if (DECL_TEMPLATE_PARM_P (parm))
   return ...;
>if (arg != scalar_constant_value (parm))
   return ...;

where we are suspiciously testing for pointer equality despite
scalar_constant_value's unsharing behavior.  This line seems to be dead
code however, so this patch replaces it with an appropriate gcc_assert.
Finally, this patch adds an explicit call to unshare_expr to each of the
three remaining callers.

Now that the calls to decl_constant_value and
decl_really_constant_value from cxx_eval_constant_expression no longer
unshare their result, memory use during constexpr evaluation for the
testcase from the PR falls from ~5GB to 15MB according to -ftime-report.

Bootstrapped and regtested on x86_64-pc-linux-gnu, and also tested on
cmcstl2 and a number of other libraries.  Does this look OK to commit?

gcc/cp/ChangeLog:

PR c++/96197
* cp-gimplify.c (cp_fold_maybe_rvalue): Call unshare_expr on the
result of decl_constant_value.
* cvt.c: Include gimplify.h.
(ocp_convert): Call unshare_expr on the result of
scalar_constant_value.
* init.c (constant_value_1): Don't call unshare_expr here,
so that callers can choose whether to unshare.
* pt.c (tsubst_copy): Call unshare_expr on the result of
scalar_constant_value.
(unify) : Assert DECL_TEMPLATE_PARM_P and
simplify accordingly.

gcc/testsuite/ChangeLog:

PR c++/96197
* g++.dg/cpp1y/constexpr-array8.C: New test.
---
 gcc/cp/cp-gimplify.c  |  

Re: [PATCH 6/7] ivopts: Add handlings for vector with length IFNs

2020-07-22 Thread Richard Sandiford
"Kewen.Lin via Gcc-patches"  writes:
> gcc/ChangeLog
>
> 2020-MM-DD  Kewen Lin  
>
>   * tree-ssa-loop-ivopts.c (get_mem_type_for_internal_fn): Handle
>   IFN_LEN_LOAD and IFN_LEN_STORE.
>   (get_alias_ptr_type_for_ptr_address): Likewise.

OK, thanks.

(Sorry, hadn't realised that this was still awaiting review.)

Richard
>
>
> ---
>  gcc/tree-ssa-loop-ivopts.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> index 1d2697ae1ba..45b31640e75 100644
> --- a/gcc/tree-ssa-loop-ivopts.c
> +++ b/gcc/tree-ssa-loop-ivopts.c
> @@ -2436,12 +2436,14 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
>  {
>  case IFN_MASK_LOAD:
>  case IFN_MASK_LOAD_LANES:
> +case IFN_LEN_LOAD:
>if (op_p == gimple_call_arg_ptr (call, 0))
>   return TREE_TYPE (gimple_call_lhs (call));
>return NULL_TREE;
>  
>  case IFN_MASK_STORE:
>  case IFN_MASK_STORE_LANES:
> +case IFN_LEN_STORE:
>if (op_p == gimple_call_arg_ptr (call, 0))
>   return TREE_TYPE (gimple_call_arg (call, 3));
>return NULL_TREE;
> @@ -7415,6 +7417,8 @@ get_alias_ptr_type_for_ptr_address (iv_use *use)
>  case IFN_MASK_STORE:
>  case IFN_MASK_LOAD_LANES:
>  case IFN_MASK_STORE_LANES:
> +case IFN_LEN_LOAD:
> +case IFN_LEN_STORE:
>/* The second argument contains the correct alias type.  */
>gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0));
>return TREE_TYPE (gimple_call_arg (call, 1));


[PATCH] [og10] Fix goacc/note-parallelism-combined-kernels-loop-auto.c test

2020-07-22 Thread Kwok Cheung Yeung

Hello

This test currently fails because some expected 'optimized: assigned OpenACC seq 
loop parallelism' messages are not being printed by the compiler. This is 
because the '.auto. conflicts with other OpenACC loop specifiers' errors are 
generated in the earlier omp-low pass, whereas the messages are emitted in the 
later oacc_loop_designation pass. The errors in the earlier pass means that the 
later pass is never executed.


In 'Update expected messages, errors and warnings for "kernels" tests' (commit 
081a01963ca8db7ddaaf5871d281321454fd3246), dg-messages for 'optimized: assigned 
OpenACC .* parallelism' messages are removed from 
note-parallelism-1-kernels-loop-auto.c, 
note-parallelism-combined-kernels-loop-auto.c and 
note-parallelism-kernels-loop-auto.c. I suppose the remaining entries in 
note-parallelism-combined-kernels-loop-auto.c were just missed by accident?


This patch removes the dg-messages for the no-longer occurring messages.

Okay for OG10?

Kwok
From 59c6bc996000fb69ee8cf2cee3ec2e279524e66f Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Thu, 18 Jun 2020 13:32:40 -0700
Subject: [PATCH 1/6] Fix failures in
 c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c testcase

2020-07-21  Kwok Cheung Yeung  

gcc/testsuite/
* c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c
(main): Remove obsolete expected messages.
---
 .../goacc/note-parallelism-combined-kernels-loop-auto.c  | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git 
a/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c
 
b/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c
index 806ccc7..b1ef3c0 100644
--- 
a/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c
+++ 
b/gcc/testsuite/c-c++-common/goacc/note-parallelism-combined-kernels-loop-auto.c
@@ -11,7 +11,7 @@ main ()
 {
   int x, y, z;
 
-#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC seq loop 
parallelism" } */
+#pragma acc kernels loop
   /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. 
construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
   for (x = 0; x < 10; x++)
 ;
@@ -60,19 +60,19 @@ main ()
   for (z = 0; z < 10; z++)
;
 
-#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq 
loop parallelism" } */
+#pragma acc kernels loop auto
   /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. 
construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
   for (x = 0; x < 10; x++)
 ;
 
-#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq 
loop parallelism" } */
+#pragma acc kernels loop auto
   /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. 
construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
   for (x = 0; x < 10; x++)
 #pragma acc loop auto
 for (y = 0; y < 10; y++)
   ;
 
-#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq 
loop parallelism" } */
+#pragma acc kernels loop auto
   /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. 
construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
   for (x = 0; x < 10; x++)
 #pragma acc loop auto
@@ -81,7 +81,7 @@ main ()
   for (z = 0; z < 10; z++)
;
 
-#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC seq loop 
parallelism" } */
+#pragma acc kernels loop
   /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. 
construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
   for (x = 0; x < 10; x++)
 #pragma acc loop auto
@@ -90,7 +90,7 @@ main ()
   for (z = 0; z < 10; z++)
;
 
-#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq 
loop parallelism" } */
+#pragma acc kernels loop auto
   /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. 
construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
   for (x = 0; x < 10; x++)
 #pragma acc loop
@@ -99,7 +99,7 @@ main ()
   for (z = 0; z < 10; z++)
;
 
-#pragma acc kernels loop auto /* { dg-message "optimized: assigned OpenACC seq 
loop parallelism" } */
+#pragma acc kernels loop auto
   /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. 
construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
   for (x = 0; x < 10; x++)
 #pragma acc loop auto
@@ -108,7 +108,7 @@ main ()
   for (z = 0; z < 10; z++)
;
 
-#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC seq loop 
parallelism" } */
+#pragma acc kernels loop
   /* { dg-message "optimized: forwarded loop nest in OpenACC .kernels. 
construct to .parloops. for analysis" "" { target *-*-* } .-1 } */
   for (x = 0; x < 10; x++)
 #pragma acc loop auto
-- 
2.8.1



Re: [PATCH 1/2] Add new RTX instruction class FILLER_INSN

2020-07-22 Thread Richard Biener via Gcc-patches
On Wed, Jul 22, 2020 at 12:03 PM Andrea Corallo  wrote:
>
> Hi all,
>
> I'd like to submit the following two patches implementing a new AArch64
> specific back-end pass that helps optimize branch-dense code, which can
> be a bottleneck for performance on some Arm cores.  This is achieved by
> padding out the branch-dense sections of the instruction stream with
> nops.
>
> The original patch was already posted some time ago:
>
> https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg200721.html
>
> This follows up splitting as suggested in two patches, rebasing on
> master and implementing the suggestions of the first code review.
>
> This first patch implements the addition of a new RTX instruction class
> FILLER_INSN, which has been white listed to allow placement of NOPs
> outside of a basic block.  This is to allow padding after unconditional
> branches.  This is favorable so that any performance gained from
> diluting branches is not paid straight back via excessive eating of
> nops.
>
> It was deemed that a new RTX class was less invasive than modifying
> behavior in regards to standard UNSPEC nops.
>
> 1/2 is requirement for 2/2.  Please see this the cover letter of this last
> for more details on the pass itself.

I wonder if such effect of instructions on the pipeline can be modeled
in the DFA and thus whether the scheduler could issue (always ready)
NOPs?

I also wonder whether such optimization is better suited for the assembler
which should know instruction lengths and alignment in a more precise
way and also would know whether extra nops make immediates too large
for pc relative things like short branches or section anchor accesses
(or whatever else)?

Richard.

> Regards
>
>   Andrea
>
> gcc/ChangeLog
>
> 2020-07-17  Andrea Corallo  
> Carey Williams  
>
> * cfgbuild.c (inside_basic_block_p): Handle FILLER_INSN.
> * cfgrtl.c (rtl_verify_bb_layout): Whitelist FILLER_INSN outside
> basic blocks.
> * coretypes.h: New rtx class.
> * emit-rtl.c (emit_filler_after): New function.
> * rtl.def (FILLER_INSN): New rtl define.
> * rtl.h (rtx_filler_insn): Define new structure.
> (FILLER_INSN_P): New macro.
> (is_a_helper ::test): New test helper for
> rtx_filler_insn.
> (emit_filler_after): New extern.
> * target-insns.def: Add target insn definition.


Re: [PATCH v2] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread Richard Sandiford
luoxhu  writes:
> Hi,
>
> On 2020/7/21 23:30, Richard Sandiford wrote:
>> Xiong Hu Luo  writes:>> @@ -1872,9 +1872,27 @@ 
>> get_stored_val (store_info *store_info, machine_mode read_mode,
>>>   {
>>> poly_int64 shift = gap * BITS_PER_UNIT;
>>> poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
>>> -  read_reg = find_shift_sequence (access_size, store_info, read_mode,
>>> - shift, optimize_bb_for_speed_p (bb),
>>> - require_cst);
>>> +  rtx rhs_subreg = NULL;
>>> +
>>> +  if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
>>> +   {
>>> + scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
>>> + poly_uint64 sub_off
>>> +   = ((!BYTES_BIG_ENDIAN)
>>> +? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
>>> +: 0);
>>> +
>>> + rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
>>> +   store_mode, sub_off);
>>> + if (rhs_subreg)
>>> +   read_reg
>>> + = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
>>> +   }
>>> +
>>> +  if (read_reg == NULL)
>>> +   read_reg
>>> + = find_shift_sequence (access_size, store_info, read_mode, shift,
>>> +optimize_bb_for_speed_p (bb), require_cst);
>> 
>> Did you consider doing this in find_shift_sequence instead?
>> ISTM that this is really using subregs to optimise:
>> 
>>/* In theory we could also check for an ashr.  Ian Taylor knows
>>   of one dsp where the cost of these two was not the same.  But
>>   this really is a rare case anyway.  */
>>target = expand_binop (new_mode, lshr_optab, new_reg,
>>   gen_int_shift_amount (new_mode, shift),
>>   new_reg, 1, OPTAB_DIRECT);
>> 
>> I think everything up to:
>> 
>>/* Also try a wider mode if the necessary punning is either not
>>   desirable or not possible.  */
>>if (!CONSTANT_P (store_info->rhs)
>>&& !targetm.modes_tieable_p (new_mode, store_mode))
>>  continue;
>> 
>> is either neutral or helpful for the subreg case too, so maybe
>> we could just add the optimisation after that.  (It probably isn't
>> worth reusing any of the later loop body code though, since the
>> subreg case is much simpler.)
>> 
>> I don't think we need to restrict this case to modes of size
>> shift * 2.  We can just check whether the shift is a multiple of
>> the new_mode calculated by find_shift_sequence (using multiple_p).
>> 
>> An easier way of converting the shift to a subreg byte offset
>> is to use subreg_offset_from_lsb, which also handles
>> BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN.
>> 
>
> Thanks, I've updated the patch by moving it into find_shift_sequence.
> Not sure whether meets your comments precisely though it still works:)
> There is a comment mentioned that 
>
>   /* Some machines like the x86 have shift insns for each size of
>  operand.  Other machines like the ppc or the ia-64 may only have
>  shift insns that shift values within 32 or 64 bit registers.
>  This loop tries to find the smallest shift insn that will right
>  justify the value we want to read but is available in one insn on
>  the machine.  */
>
> So it will early break without some additional check as the new_mode is
> TImode here:
>
>   if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
>   break;
>
>
>
> [PATCH v2] dse: Remove partial load after full store for high part 
> access[PR71309]
>
>
> This patch could optimize (works for char/short/int/void*):
>
> 6: r119:TI=[r118:DI+0x10]
> 7: [r118:DI]=r119:TI
> 8: r121:DI=[r118:DI+0x8]
>
> =>
>
> 6: r119:TI=[r118:DI+0x10]
> 18: r122:TI=r119:TI
> 16: r123:TI#0=r122:TI#8 0>>0
> 17: r123:TI#8=0
> 19: r124:DI=r123:TI#0
> 7: [r118:DI]=r119:TI
> 8: r121:DI=r124:DI
>
> Final ASM will be as below without partial load after full store(stxv+ld):
>   mr 9,3
>   ld 3,24(3)
>   ld 10,16(3)
>   std 3,8(9)
>   std 10,0(9)
>   blr
>
> It could achieve ~25% performance improvement for typical cases on
> Power9.  Bootstrap and regression testing on Power9-LE.
>
> For AArch64, one ldr is replaced by mov:
>
> ldp x2, x3, [x0, 16]
> stp x2, x3, [x0]
> ldr x0, [x0, 8]
>
> =>
>
> mov x1, x0
> ldp x2, x0, [x0, 16]
> stp x2, x0, [x1]
>
> gcc/ChangeLog:
>
> 2020-07-22  Xionghu Luo  
>
>   PR rtl-optimization/71309
>   * dse.c (find_shift_sequence): Use subreg of shifted from high part
>   register to avoid loading from address.
>
> gcc/testsuite/ChangeLog:
>
> 2020-07-22  Xionghu Luo  
>
>   PR rtl-optimization/71309
>   * gcc.target/powerpc/pr71309.c: New test.
> ---
>  gcc/dse.c  | 15 +-
>  gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
>  2 files changed, 47 insertions(+), 1 deletion(-)
>  create mode 100644 

Re: [PATCH 2/2] Aarch64: Add branch diluter pass

2020-07-22 Thread Andrew Pinski via Gcc-patches
On Wed, Jul 22, 2020 at 3:10 AM Andrea Corallo  wrote:
>
> Hi all,
>
> this second patch implements the AArch64 specific back-end pass
> 'branch-dilution' controllable by the followings command line options:
>
> -mbranch-dilution
>
> --param=aarch64-branch-dilution-granularity={num}
>
> --param=aarch64-branch-dilution-max-branches={num}
>
> Some cores known to be able to benefit from this pass have been given
> default tuning values for their granularity and max-branches.  Each
> affected core has a very specific granule size and associated max-branch
> limit.  This is a microarchitecture specific optimization.  Typical
> usage should be -mbranch-dilution with a specified -mcpu.  Cores with a
> granularity tuned to 0 will be ignored. Options are provided for
> experimentation.

Can you give a simple example of what this patch does?
Also your testcases seem too sensitive to other optimizations which
could happen.  E.g. the call to "branch (i)" could be pulled out of
the switch statement.  Or even the "*i += N;" could be moved to one
Basic block and the switch becomes just one if statement.

> Observed performance improvements on Neoverse N1 SPEC CPU 2006 where
> up to ~+3% (xalancbmk) and ~+1.5% (sjeng).  Average code size increase
> for all the testsuite proved to be ~0.4%.

Also does this improve any non-SPEC benchmarks or has it only been
benchmarked with SPEC?

A few comments about the patch itself:
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -321,7 +321,7 @@ aarch64*-*-*)
>  c_target_objs="aarch64-c.o"
>  cxx_target_objs="aarch64-c.o"
>  d_target_objs="aarch64-d.o"
> - extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
> aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
> aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o 
> falkor-tag-collision-avoidance.o aarch64-bti-insert.o"
> + extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o 
> aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o 
> aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o 
> falkor-tag-collision-avoidance.o aarch64-bti-insert.o 
> aarch64-branch-dilution.o"
>  target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c 
> \$(srcdir)/config/aarch64/aarch64-sve-builtins.h 
> \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
>  target_has_targetm_common=yes
>  ;;

I think it is time to change how extra_objs is done and split it
across a few lines; this should be done in a different patch, it will
simplify future additions later on.

+unsigned max_branch = 0;
+unsigned granule_size = 0;

These two really should be part of the insn_granule class.  Having
global variables is not a good idea with respect to threading of GCC.

+  dump_printf (MSG_NOTE,
+"%d. %s%s%s  > NEXT = (%d), PREV = (%d) -- UID: %d\n",
+insn->index, GET_RTX_NAME (GET_CODE (insn->rtx)),
+any_uncondjump_p (insn->rtx) ? " (ubranch)" : "",
+insn->is_nop ? " (nop)" : "",
+insn->next ? insn->next->index : -1,
+insn->prev ? insn->prev->index : -1, INSN_UID (insn->rtx));

This part should really be of a method of insn_info instead.

is_branch (insn->rtx)

Why not:
insn->is_branch ()

This simplifies the code and really shows what is being done.

+  if (is_branch (prev_real_nondebug_insn (insn->rtx))
+  && is_branch (next_real_nondebug_insn (insn->rtx)))

Maybe:
+  if (is_branch (insn->prev_real_nondebug_insn ())
+  && is_branch (insn->next_real_nondebug_insn ()))

+  while (current_insn && !current_insn->is_branch)
+{
+  current_insn = current_insn->next;
+}

Why not:
current_insn = current_insn->next_branch_insn ();

There are many more examples of where you can improve like the above;
that is the way you define insn_info can be improved and push some of
the implementation back into the insn_info definition.

Thanks,
Andrew

>
> * Algorithm and Heuristic
>
> The pass takes a very simple 'sliding window' approach to the problem.
> We crawl through each instruction (starting at the first branch) and
> keep track of the number of branches within the current "granule" (or
> window).  When this exceeds the max-branch value, the pass will dilute
> the current granule, inserting nops to push out some of the branches.
> The heuristic will favor unconditional branches (for performance
> reasons), or branches that are between two other branches (in order to
> decrease the likelihood of another dilution call being needed).
>
> Each branch type required a different method for nop insertion due to
> RTL/basic_block restrictions:
>
> - Returning calls do not end a basic block so can be handled by
>   emitting a generic nop.
>
> - Unconditional branches must be the end of a basic block, and nops
>   cannot be outside of a basic block.  Thus the need for FILLER_INSN,
>   which allows placement outside of a basic block and translates to a
>   nop.
>
> - For most conditional branches we've taken a simple approach and only
>   handle the 

Re: [Patch] OpenMP: Fixes for omp critical + hint

2020-07-22 Thread Tobias Burnus

Now moved to libgomp, cf. attachment.

Tobias

On 7/22/20 11:16 AM, Jakub Jelinek wrote:

On Wed, Jul 22, 2020 at 11:09:06AM +0200, Thomas Schwinge wrote:

So I suppose you'll either have to put these testcases into 'libgomp', or
we'll have to invent something else?

Indeed.


Jakub, is there a reason why for
build-tree testing we can't just add '-I[build-tree]/libgomp' etc. in
'gcc.dg/gomp/gomp.exp' etc.?

I guess historic reasons.  E.g. g++.dg/ adds those and -L for libstdc++ too,
but then most of the C++ tests that test primarily the compiler and
sometimes use headers and even more often the runtime library are there.
On the gomp side, libgomp/testsuite has been used for both compile and
link/runtime tests that need the runtime library and its headers, while
gcc/testsuite/*/gomp/ has been left for tests that don't need any of those.

  Jakub


-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
commit ade6e7204ce4d179cd9fa4637ddee85ba1fa12d9
Author: Tobias Burnus 
Date:   Wed Jul 22 12:12:48 2020 +0200

critical-hint-*.{c,f90}: Move from gcc/testsuite to libgomp/testsuite

libgomp/ChangeLog:

* testsuite/libgomp.c-c++-common/critical-hint-1.c: New; moved from
gcc/testsuite/c-c++-common/gomp/.
* testsuite/libgomp.c-c++-common/critical-hint-2.c: Likewise.
* testsuite/libgomp.fortran/critical-hint-1.f90: New; moved
from gcc/testsuite/gfortran.dg/gomp/.
* testsuite/libgomp.fortran/critical-hint-2.f90: Likewise.

gcc/testsuite/ChangeLog:

* c-c++-common/gomp/critical-hint-1.c: Moved to libgomp/.
* c-c++-common/gomp/critical-hint-2.c: Moved to libgomp/.
* gfortran.dg/gomp/critical-hint-1.f90: Moved to libgomp/.
* gfortran.dg/gomp/critical-hint-2.f90: Moved to libgomp/.
---
 .../gomp => libgomp/testsuite/libgomp.c-c++-common}/critical-hint-1.c   | 2 ++
 .../gomp => libgomp/testsuite/libgomp.c-c++-common}/critical-hint-2.c   | 1 +
 .../gomp => libgomp/testsuite/libgomp.fortran}/critical-hint-1.f90  | 2 ++
 .../gomp => libgomp/testsuite/libgomp.fortran}/critical-hint-2.f90  | 1 +
 4 files changed, 6 insertions(+)

diff --git a/gcc/testsuite/c-c++-common/gomp/critical-hint-1.c b/libgomp/testsuite/libgomp.c-c++-common/critical-hint-1.c
similarity index 98%
rename from gcc/testsuite/c-c++-common/gomp/critical-hint-1.c
rename to libgomp/testsuite/libgomp.c-c++-common/critical-hint-1.c
index 510f8abef80..1e49747477b 100644
--- a/gcc/testsuite/c-c++-common/gomp/critical-hint-1.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/critical-hint-1.c
@@ -1,3 +1,5 @@
+/* { dg-do compile } */
+
 #include 
 
 void
diff --git a/gcc/testsuite/c-c++-common/gomp/critical-hint-2.c b/libgomp/testsuite/libgomp.c-c++-common/critical-hint-2.c
similarity index 98%
rename from gcc/testsuite/c-c++-common/gomp/critical-hint-2.c
rename to libgomp/testsuite/libgomp.c-c++-common/critical-hint-2.c
index effe24a63ec..057353b6ce2 100644
--- a/gcc/testsuite/c-c++-common/gomp/critical-hint-2.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/critical-hint-2.c
@@ -1,3 +1,4 @@
+/* { dg-do compile } */
 /* { dg-additional-options "-fdump-tree-original" } */
 #include 
 
diff --git a/gcc/testsuite/gfortran.dg/gomp/critical-hint-1.f90 b/libgomp/testsuite/libgomp.fortran/critical-hint-1.f90
similarity index 99%
rename from gcc/testsuite/gfortran.dg/gomp/critical-hint-1.f90
rename to libgomp/testsuite/libgomp.fortran/critical-hint-1.f90
index c26b617f1bd..225d9a7a221 100644
--- a/gcc/testsuite/gfortran.dg/gomp/critical-hint-1.f90
+++ b/libgomp/testsuite/libgomp.fortran/critical-hint-1.f90
@@ -1,3 +1,5 @@
+! { dg-do compile }
+
 subroutine example_criticial ()
   use omp_lib
   implicit none
diff --git a/gcc/testsuite/gfortran.dg/gomp/critical-hint-2.f90 b/libgomp/testsuite/libgomp.fortran/critical-hint-2.f90
similarity index 99%
rename from gcc/testsuite/gfortran.dg/gomp/critical-hint-2.f90
rename to libgomp/testsuite/libgomp.fortran/critical-hint-2.f90
index 15d6206a438..f34680ca718 100644
--- a/gcc/testsuite/gfortran.dg/gomp/critical-hint-2.f90
+++ b/libgomp/testsuite/libgomp.fortran/critical-hint-2.f90
@@ -1,3 +1,4 @@
+! { dg-do compile }
 ! { dg-additional-options "-fdump-tree-original" }
 subroutine example_criticial ()
   use omp_lib


Re: [PATCH] PR target/96260 - KASAN should work even back-end not porting anything.

2020-07-22 Thread Jakub Jelinek via Gcc-patches
On Wed, Jul 22, 2020 at 04:53:00PM +0800, Kito Cheng wrote:
> --- a/gcc/asan.c
> +++ b/gcc/asan.c
> @@ -344,6 +344,12 @@ asan_shadow_offset ()
>return asan_shadow_offset_value;
>  }
>  
> +/* Returns Asan shadow offset has been set.  */
> +bool asan_shadow_offset_set_p ()

Formatting.  Should be
bool
asan_shadow_offset_set_p ()

> +{
> +  return asan_shadow_offset_computed;
> +}
> +
>  alias_set_type asan_shadow_set = -1;
>  
>  /* Pointer types to 1, 2 or 4 byte integers in shadow memory.  A separate

> -/* { dg-warning ".'-fsanitize=address' and '-fsanitize=kernel-address' are 
> not supported for this target" "" { target *-*-* } 0 } */
> +/* { dg-warning ".'-fsanitize=kernel-address' with stack protection is not 
> supported without '-fasan-shadow-offset=' for this target." "" { target *-*-* 
> } 0 } */

Please adjust, see below.
> index 95eea63380f6..48f13d282c52 100644
> --- a/gcc/toplev.c
> +++ b/gcc/toplev.c
> @@ -1835,7 +1835,7 @@ process_options (void)
>/* Address Sanitizer needs porting to each target architecture.  */
>  
>if ((flag_sanitize & SANITIZE_ADDRESS)
> -  && (!FRAME_GROWS_DOWNWARD || targetm.asan_shadow_offset == NULL))
> +  && !FRAME_GROWS_DOWNWARD)
>  {
>warning_at (UNKNOWN_LOCATION, 0,
> "%<-fsanitize=address%> and %<-fsanitize=kernel-address%> "
> @@ -1843,6 +1843,25 @@ process_options (void)
>flag_sanitize &= ~SANITIZE_ADDRESS;
>  }
>  
> +  if ((flag_sanitize & SANITIZE_USER_ADDRESS)
> +  && targetm.asan_shadow_offset == NULL)
> +{
> +  warning_at (UNKNOWN_LOCATION, 0,
> +   "%<-fsanitize=address%> not supported for this target");
> +  flag_sanitize &= ~SANITIZE_ADDRESS;
> +}
> +
> +  if ((flag_sanitize & SANITIZE_KERNEL_ADDRESS)
> +  && (targetm.asan_shadow_offset == NULL && param_asan_stack
> +   && !asan_shadow_offset_set_p ()))

Formatting.  If there are several & (or ||s) and it doesn't fit on one
line, each of them should be on a separate line, so there should be a
newline and indentation instead of space before "&& param_asan_stack".
> +{
> +  warning_at (UNKNOWN_LOCATION, 0,
> +   "%<-fsanitize=kernel-address%> with stack protection "
> +   "is not supported without %<-fasan-shadow-offset=%> "
> +   "for this target.");

No full stop at the end of diagnostics (plus adjust testcase for it).

Otherwise LGTM for trunk and 10.3 (see Richi's mail).

Jakub



[PATCH 2/2] Aarch64: Add branch diluter pass

2020-07-22 Thread Andrea Corallo
Hi all,

this second patch implements the AArch64 specific back-end pass
'branch-dilution' controllable by the followings command line options:

-mbranch-dilution

--param=aarch64-branch-dilution-granularity={num}

--param=aarch64-branch-dilution-max-branches={num}

Some cores known to be able to benefit from this pass have been given
default tuning values for their granularity and max-branches.  Each
affected core has a very specific granule size and associated max-branch
limit.  This is a microarchitecture specific optimization.  Typical
usage should be -mbranch-dilution with a specified -mcpu.  Cores with a
granularity tuned to 0 will be ignored. Options are provided for
experimentation.

Observed performance improvements on Neoverse N1 SPEC CPU 2006 where
up to ~+3% (xalancbmk) and ~+1.5% (sjeng).  Average code size increase
for all the testsuite proved to be ~0.4%.

* Algorithm and Heuristic

The pass takes a very simple 'sliding window' approach to the problem.
We crawl through each instruction (starting at the first branch) and
keep track of the number of branches within the current "granule" (or
window).  When this exceeds the max-branch value, the pass will dilute
the current granule, inserting nops to push out some of the branches.
The heuristic will favor unconditional branches (for performance
reasons), or branches that are between two other branches (in order to
decrease the likelihood of another dilution call being needed).

Each branch type required a different method for nop insertion due to
RTL/basic_block restrictions:

- Returning calls do not end a basic block so can be handled by
  emitting a generic nop.

- Unconditional branches must be the end of a basic block, and nops
  cannot be outside of a basic block.  Thus the need for FILLER_INSN,
  which allows placement outside of a basic block and translates to a
  nop.

- For most conditional branches we've taken a simple approach and only
  handle the fallthru edge for simplicity, which we do by inserting a
  "nop block" of nops on the fallthru edge, mapping that back to the
  original destination block.

- asm gotos and pcsets are going to be tricky to analyze from a
  dilution perspective so are ignored at present.

* Testing

The two patches has been tested together on top of current master on
aarch64-unknown-linux-gnu as follow:

- Successful compilation of 3 stage bootstrap with the
  pass forced on (for stage 2, 3)

- No additional compilation failures (SPEC CPU 2006 and SPEC CPU 2017)

- No 'make check' regressions


Regards

  Andrea

gcc/ChangeLog

2020-07-17  Andrea Corallo  
Carey Williams  

* config.gcc (extra_objs): Add aarch64-branch-dilution.o.
* config/aarch64/aarch64-branch-dilution.c: New file.
* config/aarch64/aarch64-passes.def (branch-dilution): Register
pass.
* config/aarch64/aarch64-protos.h (struct tune_params): Declare
tuning parameters bdilution_gsize and bdilution_maxb.
(make_pass_branch_dilution): New declaration.
* config/aarch64/aarch64.c (generic_tunings, cortexa35_tunings)
(cortexa53_tunings, cortexa57_tunings, cortexa72_tunings)
(cortexa73_tunings, exynosm1_tunings, thunderxt88_tunings)
(thunderx_tunings, tsv110_tunings, xgene1_tunings)
(qdf24xx_tunings, saphira_tunings, thunderx2t99_tunings)
(neoversen1_tunings): Provide default tunings for bdilution_gsize
and bdilution_maxb.
* config/aarch64/aarch64.md (filler_insn): Define new insn.
* config/aarch64/aarch64.opt (-mbranch-dilution)
(--param=aarch64-branch-dilution-granularity)
(--param=aarch64-branch-dilution-max-branches): Add new options.
* config/aarch64/t-aarch64 (aarch64-branch-dilution.c): New rule
for aarch64-branch-dilution.c.
* doc/invoke.texi (-mbranch-dilution)
(--param=aarch64-branch-dilution-granularity)
(--param=aarch64-branch-dilution-max-branches): Document branch
dilution options.

gcc/testsuite/ChangeLog

2020-07-17  Andrea Corallo  
Carey Williams  

* gcc.target/aarch64/branch-dilution-off.c: New file.
* gcc.target/aarch64/branch-dilution-on.c: New file.
>From 386b3a3131d5f03a4c9fb8ee47b321009f17fab5 Mon Sep 17 00:00:00 2001
From: Andrea Corallo 
Date: Thu, 16 Jul 2020 09:24:33 +0100
Subject: [PATCH 2/2] Aarch64: Add branch diluter pass

gcc/ChangeLog

2020-07-17  Andrea Corallo  
	Carey Williams  

	* config.gcc (extra_objs): Add aarch64-branch-dilution.o.
	* config/aarch64/aarch64-branch-dilution.c: New file.
	* config/aarch64/aarch64-passes.def (branch-dilution): Register
	pass.
* config/aarch64/aarch64-protos.h (struct tune_params): Declare
	tuning parameters bdilution_gsize and bdilution_maxb.
(make_pass_branch_dilution): New declaration.
* config/aarch64/aarch64.c (generic_tunings, cortexa35_tunings)
(cortexa53_tunings, cortexa57_tunings, cortexa72_tunings)
 

[PATCH 1/2] Add new RTX instruction class FILLER_INSN

2020-07-22 Thread Andrea Corallo
Hi all,

I'd like to submit the following two patches implementing a new AArch64
specific back-end pass that helps optimize branch-dense code, which can
be a bottleneck for performance on some Arm cores.  This is achieved by
padding out the branch-dense sections of the instruction stream with
nops.

The original patch was already posted some time ago:

https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg200721.html

This follows up splitting as suggested in two patches, rebasing on
master and implementing the suggestions of the first code review.

This first patch implements the addition of a new RTX instruction class
FILLER_INSN, which has been white listed to allow placement of NOPs
outside of a basic block.  This is to allow padding after unconditional
branches.  This is favorable so that any performance gained from
diluting branches is not paid straight back via excessive eating of
nops.

It was deemed that a new RTX class was less invasive than modifying
behavior in regards to standard UNSPEC nops.

1/2 is requirement for 2/2.  Please see this the cover letter of this last
for more details on the pass itself.

Regards

  Andrea

gcc/ChangeLog

2020-07-17  Andrea Corallo  
Carey Williams  

* cfgbuild.c (inside_basic_block_p): Handle FILLER_INSN.
* cfgrtl.c (rtl_verify_bb_layout): Whitelist FILLER_INSN outside
basic blocks.
* coretypes.h: New rtx class.
* emit-rtl.c (emit_filler_after): New function.
* rtl.def (FILLER_INSN): New rtl define.
* rtl.h (rtx_filler_insn): Define new structure.
(FILLER_INSN_P): New macro.
(is_a_helper ::test): New test helper for
rtx_filler_insn.
(emit_filler_after): New extern.
* target-insns.def: Add target insn definition.
>From 475bbb3984ed133b020b344eebc2d4d3bf8ce52f Mon Sep 17 00:00:00 2001
From: Andrea Corallo 
Date: Thu, 16 Jul 2020 09:21:38 +0100
Subject: [PATCH 1/2] Add new RTX instruction class FILLER_INSN

gcc/ChangeLog

2020-07-17  Andrea Corallo  
	Carey Williams  

	* cfgbuild.c (inside_basic_block_p): Handle FILLER_INSN.
	* cfgrtl.c (rtl_verify_bb_layout): Whitelist FILLER_INSN outside
	basic blocks.
	* coretypes.h: New rtx class.
	* emit-rtl.c (emit_filler_after): New function.
	* rtl.def (FILLER_INSN): New rtl define.
	* rtl.h (rtx_filler_insn): Define new structure.
	(FILLER_INSN_P): New macro.
	(is_a_helper ::test): New test helper for
	rtx_filler_insn.
	(emit_filler_after): New extern.
	* target-insns.def: Add target insn definition.
---
 gcc/cfgbuild.c   |  1 +
 gcc/cfgrtl.c | 16 +++-
 gcc/coretypes.h  |  1 +
 gcc/emit-rtl.c   | 14 ++
 gcc/rtl.def  |  4 
 gcc/rtl.h| 23 +++
 gcc/target-insns.def |  1 +
 7 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/gcc/cfgbuild.c b/gcc/cfgbuild.c
index 478afa5fe91c..07cb06afba07 100644
--- a/gcc/cfgbuild.c
+++ b/gcc/cfgbuild.c
@@ -58,6 +58,7 @@ inside_basic_block_p (const rtx_insn *insn)
 
 case JUMP_TABLE_DATA:
 case BARRIER:
+case FILLER_INSN:
 case NOTE:
   return false;
 
diff --git a/gcc/cfgrtl.c b/gcc/cfgrtl.c
index 827e84a44ddd..02139aaa268d 100644
--- a/gcc/cfgrtl.c
+++ b/gcc/cfgrtl.c
@@ -61,6 +61,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-pass.h"
 #include "print-rtl.h"
+#include "rtl-iter.h"
 
 /* Disable warnings about missing quoting in GCC diagnostics.  */
 #if __GNUC__ >= 10
@@ -3033,7 +3034,20 @@ rtl_verify_bb_layout (void)
 	  break;
 
 	default:
-	  fatal_insn ("insn outside basic block", x);
+	  /* Allow nops after branches, via FILLER_INSN.  */
+	  bool fail = true;
+	  subrtx_iterator::array_type array;
+	  FOR_EACH_SUBRTX (iter, array, x, ALL)
+		{
+		  const_rtx rtx = *iter;
+		  if (GET_CODE (rtx) == FILLER_INSN)
+		{
+		  fail = false;
+		  break;
+		}
+		}
+	  if (fail)
+		fatal_insn ("insn outside basic block", x);
 	}
 	}
 
diff --git a/gcc/coretypes.h b/gcc/coretypes.h
index 6b6cfcdf210d..5c6633a815c5 100644
--- a/gcc/coretypes.h
+++ b/gcc/coretypes.h
@@ -84,6 +84,7 @@ struct rtx_def;
 struct rtx_call_insn;   /* CALL_P (X) */
 struct rtx_jump_table_data; /* JUMP_TABLE_DATA_P (X) */
 struct rtx_barrier; /* BARRIER_P (X) */
+struct rtx_filler_insn; /* FILLER_INSN_P (X) */
 struct rtx_code_label;  /* LABEL_P (X) */
 struct rtx_note;/* NOTE_P (X) */
 
diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
index f9b0e9714d9e..76f25c011b2a 100644
--- a/gcc/emit-rtl.c
+++ b/gcc/emit-rtl.c
@@ -4746,6 +4746,20 @@ emit_barrier_after (rtx_insn *after)
   return insn;
 }
 
+/* Make an insn of code FILLER_INSN to
+   pad out the instruction stream.
+   PATTERN should be from gen_filler_insn ().
+   AFTER will typically be an unconditional
+   branch at the end of a basic block.  */
+
+rtx_insn *
+emit_filler_after (rtx 

Re: [PATCH] PR target/96260 - KASAN should work even back-end not porting anything.

2020-07-22 Thread Richard Biener
On Wed, 22 Jul 2020, Kito Cheng wrote:

>  - Most KASAN function don't need any porting anything in back-end
>except asan stack protection.
> 
>  - However kernel will given shadow offset when enable asan stack
>protection, so eveything in KASAN can work if shadow offset is given.
> 
>  - Verified with x86 and risc-v.
> 
>  - Verified with RISC-V linux kernel.
> 
> OK for trunk and GCC 10 branch?

If it is approved please wait until after the GCC 10.2 release tomorrow.

Richard.

> gcc/ChangeLog:
> 
>   PR target/96260
>   * asan.c (asan_shadow_offset_set_p): New.
>   * asan.h (asan_shadow_offset_set_p): Ditto.
>   * toplev.c (process_options): Allow -fsanitize=kernel-address
>   even TARGET_ASAN_SHADOW_OFFSET not implemented, only check when
>   asan stack protection is enabled.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR target/96260
>   * gcc.target/riscv/pr91441.c: Update warning message.
>   * gcc.target/riscv/pr96260.c: New.
> ---
>  gcc/asan.c   |  6 ++
>  gcc/asan.h   |  2 ++
>  gcc/testsuite/gcc.target/riscv/pr91441.c |  2 +-
>  gcc/testsuite/gcc.target/riscv/pr96260.c |  9 +
>  gcc/toplev.c | 21 -
>  5 files changed, 38 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/pr96260.c
> 
> diff --git a/gcc/asan.c b/gcc/asan.c
> index 9c9aa4cae358..2e759540246f 100644
> --- a/gcc/asan.c
> +++ b/gcc/asan.c
> @@ -344,6 +344,12 @@ asan_shadow_offset ()
>return asan_shadow_offset_value;
>  }
>  
> +/* Returns Asan shadow offset has been set.  */
> +bool asan_shadow_offset_set_p ()
> +{
> +  return asan_shadow_offset_computed;
> +}
> +
>  alias_set_type asan_shadow_set = -1;
>  
>  /* Pointer types to 1, 2 or 4 byte integers in shadow memory.  A separate
> diff --git a/gcc/asan.h b/gcc/asan.h
> index 9efd33f9b86b..114b457ef91c 100644
> --- a/gcc/asan.h
> +++ b/gcc/asan.h
> @@ -129,6 +129,8 @@ asan_var_and_redzone_size (unsigned HOST_WIDE_INT size)
>  
>  extern bool set_asan_shadow_offset (const char *);
>  
> +extern bool asan_shadow_offset_set_p ();
> +
>  extern void set_sanitized_sections (const char *);
>  
>  extern bool asan_sanitize_stack_p (void);
> diff --git a/gcc/testsuite/gcc.target/riscv/pr91441.c 
> b/gcc/testsuite/gcc.target/riscv/pr91441.c
> index 593a2972a0f0..2403c98bb703 100644
> --- a/gcc/testsuite/gcc.target/riscv/pr91441.c
> +++ b/gcc/testsuite/gcc.target/riscv/pr91441.c
> @@ -7,4 +7,4 @@ int *f( int a)
>  {
>return bar();
>  }
> -/* { dg-warning ".'-fsanitize=address' and '-fsanitize=kernel-address' are 
> not supported for this target" "" { target *-*-* } 0 } */
> +/* { dg-warning ".'-fsanitize=kernel-address' with stack protection is not 
> supported without '-fasan-shadow-offset=' for this target." "" { target *-*-* 
> } 0 } */
> diff --git a/gcc/testsuite/gcc.target/riscv/pr96260.c 
> b/gcc/testsuite/gcc.target/riscv/pr96260.c
> new file mode 100644
> index ..229997f877b7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/pr96260.c
> @@ -0,0 +1,9 @@
> +/* PR target/96260 */
> +/* { dg-do compile } */
> +/* { dg-options "--param asan-stack=1 -fsanitize=kernel-address 
> -fasan-shadow-offset=0x10" } */
> +
> +int *bar(int *);
> +int *f( int a)
> +{
> +  return bar();
> +}
> diff --git a/gcc/toplev.c b/gcc/toplev.c
> index 95eea63380f6..48f13d282c52 100644
> --- a/gcc/toplev.c
> +++ b/gcc/toplev.c
> @@ -1835,7 +1835,7 @@ process_options (void)
>/* Address Sanitizer needs porting to each target architecture.  */
>  
>if ((flag_sanitize & SANITIZE_ADDRESS)
> -  && (!FRAME_GROWS_DOWNWARD || targetm.asan_shadow_offset == NULL))
> +  && !FRAME_GROWS_DOWNWARD)
>  {
>warning_at (UNKNOWN_LOCATION, 0,
> "%<-fsanitize=address%> and %<-fsanitize=kernel-address%> "
> @@ -1843,6 +1843,25 @@ process_options (void)
>flag_sanitize &= ~SANITIZE_ADDRESS;
>  }
>  
> +  if ((flag_sanitize & SANITIZE_USER_ADDRESS)
> +  && targetm.asan_shadow_offset == NULL)
> +{
> +  warning_at (UNKNOWN_LOCATION, 0,
> +   "%<-fsanitize=address%> not supported for this target");
> +  flag_sanitize &= ~SANITIZE_ADDRESS;
> +}
> +
> +  if ((flag_sanitize & SANITIZE_KERNEL_ADDRESS)
> +  && (targetm.asan_shadow_offset == NULL && param_asan_stack
> +   && !asan_shadow_offset_set_p ()))
> +{
> +  warning_at (UNKNOWN_LOCATION, 0,
> +   "%<-fsanitize=kernel-address%> with stack protection "
> +   "is not supported without %<-fasan-shadow-offset=%> "
> +   "for this target.");
> +  flag_sanitize &= ~SANITIZE_ADDRESS;
> +}
> +
>   /* Do not use IPA optimizations for register allocation if profiler is 
> active
>  or patchable function entries are inserted for run-time instrumentation
>  or port does not emit prologue and epilogue as RTL.  */
> 

-- 
Richard Biener 

Re: [PATCH 1/3] expr: Allow scalar_int_mode target mode when converting a constant

2020-07-22 Thread Jozef Lawrynowicz
On Wed, Jul 22, 2020 at 09:33:47AM +0100, Richard Sandiford wrote:
> Jozef Lawrynowicz  writes:
> > is_int_mode does not allow MODE_PARTIAL_INT modes, so convert_modes was
> > not allowing a constant value to be converted to a MODE_PARTIAL_INT for
> > use as operand 2 in patterns such as ashlpsi3. The constant had
> > to be copied into a register before it could be used, but now can be
> > used directly as an operand without any copying.
> 
> Yeah.  I guess this dates back to when MODE_PARTIAL_INTs didn't have
> a known precision.

Is that what the section on MODE_PARTIAL_INT in the description for the
"subreg" RTX refers to?  From "14.8 Registers and Memory" of gccint:

  A MODE_PARTIAL_INT mode behaves as if it were as wide as the corresponding
  MODE_INT mode, except that it has an unknown number of undefined bits.

If so, that whole section seems out of date. I can work on getting it
fixed up.

> 
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index c7c3e9fd655..5a252f0935f 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -696,7 +696,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, 
> > rtx x, int unsignedp)
> >  return x;
> >  
> >if (CONST_SCALAR_INT_P (x)
> > -  && is_int_mode (mode, _mode))
> > +  && is_a  (mode, _mode))
> >  {
> >/* If the caller did not tell us the old mode, then there is not
> >  much to do with respect to canonicalization.  We have to
> 
> I think we also need to change the condition in:
> 
>   /* If the caller did not tell us the old mode, then there is not
>much to do with respect to canonicalization.  We have to
>assume that all the bits are significant.  */
>   if (GET_MODE_CLASS (oldmode) != MODE_INT)
> 
> to is_a  (old_mode)
> 
> OK with that, thanks.

Thanks, I'll regtest that change and apply if all looks good.

Jozef
> 
> Richard


Re: [Patch] OpenMP: Fixes for omp critical + hint

2020-07-22 Thread Jakub Jelinek via Gcc-patches
On Wed, Jul 22, 2020 at 11:25:47AM +0200, Tobias Burnus wrote:
> FORTRAN MODULE
> Then I have the question why, e.g., "use openacc_kinds" in
> gfortran.dg/goacc/acc_on_device-2.f95 does work. What's different there?

Because that test defines its own module with that name:
module openacc_kinds
  implicit none

  integer, parameter :: acc_device_kind = 4

end module openacc_kinds
Ditto openacc module.

It is like e.g. many g++.dg/ tests which intentionally don't include the
libstdc++ headers, but instead provide minimal definitions of whatever they
need (e.g. placement new, the spaceship stuff, ...).

Jakub



Re: [Patch] OpenMP: Fixes for omp critical + hint

2020-07-22 Thread Jakub Jelinek via Gcc-patches
On Wed, Jul 22, 2020 at 11:25:47AM +0200, Tobias Burnus wrote:
> On 7/22/20 11:09 AM, Thomas Schwinge wrote:
> 
> > For build-tree testing, that'll pick up the *system* 'omp.h', thus:
> >  FAIL: c-c++-common/gomp/critical-hint-1.c (test for excess errors)
> >  Excess errors:
> >  [...]/c-c++-common/gomp/critical-hint-1.c:10:33: error: 
> > 'omp_sync_hint_none' undeclared (first use in this function); did you mean 
> > 'omp_lock_hint_none'?
> >  [...]
> 
> HEADER FILE:
> For "ISO_Fortran_binding.h", we use:
>   #include "../../../libgfortran/ISO_Fortran_binding.h"
> Can you check whether something like that also works for 'omp.h'?
> (Additional "../" and "libgomp/".)

This can't work, because unlike ISO_Fortran_binding.h, omp.h is
a generated file, so it doesn't appear in the source directory.

Jakub



Re: [Patch] OpenMP: Fixes for omp critical + hint

2020-07-22 Thread Tobias Burnus

On 7/22/20 11:09 AM, Thomas Schwinge wrote:


For build-tree testing, that'll pick up the *system* 'omp.h', thus:
 FAIL: c-c++-common/gomp/critical-hint-1.c (test for excess errors)
 Excess errors:
 [...]/c-c++-common/gomp/critical-hint-1.c:10:33: error: 
'omp_sync_hint_none' undeclared (first use in this function); did you mean 
'omp_lock_hint_none'?
 [...]


HEADER FILE:
For "ISO_Fortran_binding.h", we use:
  #include "../../../libgfortran/ISO_Fortran_binding.h"
Can you check whether something like that also works for 'omp.h'?
(Additional "../" and "libgomp/".)

Can you check whether that helps? For some reasons, those do
not fail here.


Similar:
 FAIL: gfortran.dg/gomp/critical-hint-1.f90   -O  (test for excess errors)
 Excess errors:
 [...]/gfortran.dg/gomp/critical-hint-1.f90:2:7: Fatal Error: Cannot open 
module file 'omp_lib.mod' for reading at (1): No such file or directory


FORTRAN MODULE
Then I have the question why, e.g., "use openacc_kinds" in
gfortran.dg/goacc/acc_on_device-2.f95 does work. What's different there?

The *exp file does not seem to be any different (except that goacc.exp
adds "dg-compile-aux-modules").


Tobias

-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter


Re: [PATCH v2] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/21 23:30, Richard Sandiford wrote:
> Xiong Hu Luo  writes:>> @@ -1872,9 +1872,27 @@ 
> get_stored_val (store_info *store_info, machine_mode read_mode,
>>   {
>> poly_int64 shift = gap * BITS_PER_UNIT;
>> poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
>> -  read_reg = find_shift_sequence (access_size, store_info, read_mode,
>> -  shift, optimize_bb_for_speed_p (bb),
>> -  require_cst);
>> +  rtx rhs_subreg = NULL;
>> +
>> +  if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
>> +{
>> +  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
>> +  poly_uint64 sub_off
>> += ((!BYTES_BIG_ENDIAN)
>> + ? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
>> + : 0);
>> +
>> +  rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
>> +store_mode, sub_off);
>> +  if (rhs_subreg)
>> +read_reg
>> +  = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
>> +}
>> +
>> +  if (read_reg == NULL)
>> +read_reg
>> +  = find_shift_sequence (access_size, store_info, read_mode, shift,
>> + optimize_bb_for_speed_p (bb), require_cst);
> 
> Did you consider doing this in find_shift_sequence instead?
> ISTM that this is really using subregs to optimise:
> 
>/* In theory we could also check for an ashr.  Ian Taylor knows
>of one dsp where the cost of these two was not the same.  But
>this really is a rare case anyway.  */
>target = expand_binop (new_mode, lshr_optab, new_reg,
>gen_int_shift_amount (new_mode, shift),
>new_reg, 1, OPTAB_DIRECT);
> 
> I think everything up to:
> 
>/* Also try a wider mode if the necessary punning is either not
>desirable or not possible.  */
>if (!CONSTANT_P (store_info->rhs)
> && !targetm.modes_tieable_p (new_mode, store_mode))
>   continue;
> 
> is either neutral or helpful for the subreg case too, so maybe
> we could just add the optimisation after that.  (It probably isn't
> worth reusing any of the later loop body code though, since the
> subreg case is much simpler.)
> 
> I don't think we need to restrict this case to modes of size
> shift * 2.  We can just check whether the shift is a multiple of
> the new_mode calculated by find_shift_sequence (using multiple_p).
> 
> An easier way of converting the shift to a subreg byte offset
> is to use subreg_offset_from_lsb, which also handles
> BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN.
> 

Thanks, I've updated the patch by moving it into find_shift_sequence.
Not sure whether it meets your comments precisely, though it still works :)
There is a comment mentioned that 
There is a comment mentioned that 

  /* Some machines like the x86 have shift insns for each size of
 operand.  Other machines like the ppc or the ia-64 may only have
 shift insns that shift values within 32 or 64 bit registers.
 This loop tries to find the smallest shift insn that will right
 justify the value we want to read but is available in one insn on
 the machine.  */

So it will break early without some additional check, as the new_mode is
TImode here:

  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
break;



[PATCH v2] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
18: r122:TI=r119:TI
16: r123:TI#0=r122:TI#8 0>>0
17: r123:TI#8=0
19: r124:DI=r123:TI#0
7: [r118:DI]=r119:TI
8: r121:DI=r124:DI

Final ASM will be as below without partial load after full store(stxv+ld):
  mr 9,3
  ld 3,24(3)
  ld 10,16(3)
  std 3,8(9)
  std 10,0(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression testing on Power9-LE.

For AArch64, one ldr is replaced by mov:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 15 +-
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..e06a9fbb0cd 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,7 +1736,8 @@ find_shift_sequence (poly_int64 

Re: [PATCH] doc: fix a typo in languages.texi

2020-07-22 Thread Richard Sandiford
Wei Wentao  writes:
> hi,
> This patch fix a typo in languages.texi.

Thanks for the patch, pushed to master.

Richard

>
> Regards!
> weiwt
>
> ---
>  gcc/doc/languages.texi | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/doc/languages.texi b/gcc/doc/languages.texi
> index 70641482a1d..c6144f253c5 100644
> --- a/gcc/doc/languages.texi
> +++ b/gcc/doc/languages.texi
> @@ -21,7 +21,7 @@ GCC, has several advantages:
>  target machines already present in GCC@.
>  @item GCC front ends benefit from all the optimizations in GCC@.  Some
>  of these, such as alias analysis, may work better when GCC is
> -compiling directly from source code then when it is compiling from
> +compiling directly from source code than when it is compiling from
>  generated C code.
>  @item Better debugging information is generated when compiling
>  directly from source code than when going via intermediate generated C


Re: [Patch] OpenMP: Fixes for omp critical + hint

2020-07-22 Thread Jakub Jelinek via Gcc-patches
On Wed, Jul 22, 2020 at 11:09:06AM +0200, Thomas Schwinge wrote:
> So I suppose you'll either have to put these testcases into 'libgomp', or
> we'll have to invent something else?

Indeed.

> Jakub, is there a reason why for
> build-tree testing we can't just add '-I[build-tree]/libgomp' etc. in
> 'gcc.dg/gomp/gomp.exp' etc.?

I guess historic reasons.  E.g. g++.dg/ adds those and -L for libstdc++ too,
but then most of the C++ tests that test primarily the compiler and
sometimes use headers and even more often the runtime library are there.
On the gomp side, libgomp/testsuite has been used for both compile and
link/runtime tests that need the runtime library and its headers, while
gcc/testsuite/*/gomp/ has been left for tests that don't need any of those.

Jakub



[committed] MAINTAINERS: Add myself for write after approval

2020-07-22 Thread Przemyslaw Wirkus
ChangeLog:

2020-07-22  Przemyslaw Wirkus  

* MAINTAINERS (Write After Approval): Add myself.

---

diff --git a/MAINTAINERS b/MAINTAINERS
index 
d1343d33f1abb4a4bec7deac6c86551b83ecbdf1..300c10edf196d5698ea0fb0f8ee6a8f50a642292
 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -645,6 +645,7 @@ Mark Wielaard   

 Edmar Wienskoski   
 Ollie Wild 
 Kevin Williams 
+Przemyslaw Wirkus  
 Carlo Wood 
 Jackson Woodruff   
 Mingjie Xing   


Re: [PATCH v2] vect/rs6000: Support vector with length cost modeling

2020-07-22 Thread Richard Sandiford
"Kewen.Lin"  writes:
> Hi Richard,
>
> on 2020/7/21 下午3:57, Richard Biener wrote:
>> On Tue, Jul 21, 2020 at 7:52 AM Kewen.Lin  wrote:
>>>
>>> Hi,
>>>
>>> This patch is to add the cost modeling for vector with length,
>>> it mainly follows what we generate for vector with length in
>>> functions vect_set_loop_controls_directly and vect_gen_len
>>> at the worst case.
>>>
>>> For Power, the length is expected to be in bits 0-7 (high bits),
>>> we have to model the cost of shifting bits.  To allow other targets
>>> not suffer this, I used one target hook to describe this extra cost,
>>> I'm not sure if it's a correct way.
>>>
>>> Bootstrapped/regtested on powerpc64le-linux-gnu (P9) with explicit
>>> param vect-partial-vector-usage=1.
>>>
>>> Any comments/suggestions are highly appreciated!
>> 
>> I don't like the introduction of an extra target hook for this.  All
>> vectorizer cost modeling should ideally go through
>> init_cost/add_stmt_cost/finish_cost.  If the extra costing is
>> not per stmt then either init_cost or finish_cost is appropriate.
>> Currently init_cost only gets a struct loop while we should
>> probably give it a vec_info * parameter so targets can
>> check LOOP_VINFO_USING_PARTIAL_VECTORS_P and friends.
>> 
>
> Thanks!  Nice, your suggested way looks better.  I've removed the hook
> and taken care of it in finish_cost.  The updated v2 is attached.
>
> Bootstrapped/regtested again on powerpc64le-linux-gnu (P9) with explicit
> param vect-partial-vector-usage=1.
>
> BR,
> Kewen
> -
> gcc/ChangeLog:
>
>   * config/rs6000/rs6000.c (adjust_vect_cost): New function.
>   (rs6000_finish_cost): Call function adjust_vect_cost.
>   * tree-vect-loop.c (vect_estimate_min_profitable_iters): Add cost
>   modeling for vector with length.
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 5a4f07d5810..f2724e792c9 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -5177,6 +5177,34 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void 
> *data, int count,
>return retval;
>  }
>  
> +/* For some target specific vectorization cost which can't be handled per 
> stmt,
> +   we check the requisite conditions and adjust the vectorization cost
> +   accordingly if satisfied.  One typical example is to model shift cost for
> +   vector with length by counting number of required lengths under condition
> +   LOOP_VINFO_FULLY_WITH_LENGTH_P.  */
> +
> +static void
> +adjust_vect_cost (rs6000_cost_data *data)
> +{
> +  struct loop *loop = data->loop_info;
> +  gcc_assert (loop);
> +  loop_vec_info loop_vinfo = loop_vec_info_for_loop (loop);
> +
> +  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +{
> +  rgroup_controls *rgc;
> +  unsigned int num_vectors_m1;
> +  unsigned int shift_cnt = 0;
> +  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
> + if (rgc->type)
> +   /* Each length needs one shift to fill into bits 0-7.  */
> +   shift_cnt += (num_vectors_m1 + 1);
> +
> +  rs6000_add_stmt_cost (loop_vinfo, (void *) data, shift_cnt, 
> scalar_stmt,
> + NULL, NULL_TREE, 0, vect_body);
> +}
> +}
> +
>  /* Implement targetm.vectorize.finish_cost.  */
>  
>  static void
> @@ -5186,7 +5214,10 @@ rs6000_finish_cost (void *data, unsigned 
> *prologue_cost,
>rs6000_cost_data *cost_data = (rs6000_cost_data*) data;
>  
>if (cost_data->loop_info)
> -rs6000_density_test (cost_data);
> +{
> +  adjust_vect_cost (cost_data);
> +  rs6000_density_test (cost_data);
> +}
>  
>/* Don't vectorize minimum-vectorization-factor, simple copy loops
>   that require versioning for any reason.  The vectorization is at
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e933441b922..99e1fd7bdd0 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -3652,7 +3652,7 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>   TODO: Build an expression that represents peel_iters for prologue and
>   epilogue to be used in a run-time test.  */
>  
> -  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
>  {
>peel_iters_prologue = 0;
>peel_iters_epilogue = 0;
> @@ -3663,45 +3663,145 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
> peel_iters_epilogue += 1;
> stmt_info_for_cost *si;
> int j;
> -   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
> - j, si)
> +   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j,
> + si)
>   (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
> si->kind, si->stmt_info, si->vectype,
> si->misalign, vect_epilogue);
>   }
>  
> -  /* Calculate how many masks we need to generate.  */

Re: [Patch] OpenMP: Fixes for omp critical + hint

2020-07-22 Thread Thomas Schwinge
Hi Tobias!

On 2020-07-21T17:43:00+0200, Tobias Burnus  wrote:
> On 7/21/20 2:18 PM, Jakub Jelinek wrote:
>> [...] shows that we don't really handle critical in such case
>> correctly, because nothing will then try to verify the value after
>> instantiation.
>
> In principle, the compiler is not required to diagnose all invalid code ...

Sure, but we should at least make an attempt to implement consistent
checking for reasonably similar cases that we know about requiring
separate/similar handling, such as non-templated vs. templated C++.

> In any case, I have now implemented it now also in pt.c.

Thanks.


> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/gomp/critical-hint-1.c
> @@ -0,0 +1,47 @@
> +#include 

For build-tree testing, that'll pick up the *system* 'omp.h', thus:

FAIL: c-c++-common/gomp/critical-hint-1.c (test for excess errors)
Excess errors:
[...]/c-c++-common/gomp/critical-hint-1.c:10:33: error: 
'omp_sync_hint_none' undeclared (first use in this function); did you mean 
'omp_lock_hint_none'?
[...]

> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/gomp/critical-hint-2.c
> @@ -0,0 +1,36 @@
> +/* { dg-additional-options "-fdump-tree-original" } */
> +#include 

Likewise.

> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/gomp/critical-hint-1.f90
> @@ -0,0 +1,94 @@
> +subroutine example_criticial ()
> +  use omp_lib

Similar:

FAIL: gfortran.dg/gomp/critical-hint-1.f90   -O  (test for excess errors)
Excess errors:
[...]/gfortran.dg/gomp/critical-hint-1.f90:2:7: Fatal Error: Cannot open 
module file 'omp_lib.mod' for reading at (1): No such file or directory

> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/gomp/critical-hint-2.f90
> @@ -0,0 +1,65 @@
> +! { dg-additional-options "-fdump-tree-original" }
> +subroutine example_criticial ()
> +  use omp_lib

Likewise.

So I suppose you'll either have to put these testcases into 'libgomp', or
we'll have to invent something else?  Jakub, is there a reason why for
build-tree testing we can't just add '-I[build-tree]/libgomp' etc. in
'gcc.dg/gomp/gomp.exp' etc.?


Grüße
 Thomas

-
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter


[PATCH] PR target/96260 - KASAN should work even back-end not porting anything.

2020-07-22 Thread Kito Cheng
 - Most KASAN functions don't need any porting in the back-end,
   except for asan stack protection.

 - However, the kernel will give the shadow offset when enabling asan stack
   protection, so everything in KASAN can work if the shadow offset is given.

 - Verified with x86 and risc-v.

 - Verified with RISC-V linux kernel.

OK for trunk and GCC 10 branch?

gcc/ChangeLog:

PR target/96260
* asan.c (asan_shadow_offset_set_p): New.
* asan.h (asan_shadow_offset_set_p): Ditto.
* toplev.c (process_options): Allow -fsanitize=kernel-address
even TARGET_ASAN_SHADOW_OFFSET not implemented, only check when
asan stack protection is enabled.

gcc/testsuite/ChangeLog:

PR target/96260
* gcc.target/riscv/pr91441.c: Update warning message.
* gcc.target/riscv/pr96260.c: New.
---
 gcc/asan.c   |  6 ++
 gcc/asan.h   |  2 ++
 gcc/testsuite/gcc.target/riscv/pr91441.c |  2 +-
 gcc/testsuite/gcc.target/riscv/pr96260.c |  9 +
 gcc/toplev.c | 21 -
 5 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr96260.c

diff --git a/gcc/asan.c b/gcc/asan.c
index 9c9aa4cae358..2e759540246f 100644
--- a/gcc/asan.c
+++ b/gcc/asan.c
@@ -344,6 +344,12 @@ asan_shadow_offset ()
   return asan_shadow_offset_value;
 }
 
+/* Returns Asan shadow offset has been set.  */
+bool asan_shadow_offset_set_p ()
+{
+  return asan_shadow_offset_computed;
+}
+
 alias_set_type asan_shadow_set = -1;
 
 /* Pointer types to 1, 2 or 4 byte integers in shadow memory.  A separate
diff --git a/gcc/asan.h b/gcc/asan.h
index 9efd33f9b86b..114b457ef91c 100644
--- a/gcc/asan.h
+++ b/gcc/asan.h
@@ -129,6 +129,8 @@ asan_var_and_redzone_size (unsigned HOST_WIDE_INT size)
 
 extern bool set_asan_shadow_offset (const char *);
 
+extern bool asan_shadow_offset_set_p ();
+
 extern void set_sanitized_sections (const char *);
 
 extern bool asan_sanitize_stack_p (void);
diff --git a/gcc/testsuite/gcc.target/riscv/pr91441.c 
b/gcc/testsuite/gcc.target/riscv/pr91441.c
index 593a2972a0f0..2403c98bb703 100644
--- a/gcc/testsuite/gcc.target/riscv/pr91441.c
+++ b/gcc/testsuite/gcc.target/riscv/pr91441.c
@@ -7,4 +7,4 @@ int *f( int a)
 {
   return bar();
 }
-/* { dg-warning ".'-fsanitize=address' and '-fsanitize=kernel-address' are not 
supported for this target" "" { target *-*-* } 0 } */
+/* { dg-warning ".'-fsanitize=kernel-address' with stack protection is not 
supported without '-fasan-shadow-offset=' for this target." "" { target *-*-* } 
0 } */
diff --git a/gcc/testsuite/gcc.target/riscv/pr96260.c 
b/gcc/testsuite/gcc.target/riscv/pr96260.c
new file mode 100644
index ..229997f877b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr96260.c
@@ -0,0 +1,9 @@
+/* PR target/96260 */
+/* { dg-do compile } */
+/* { dg-options "--param asan-stack=1 -fsanitize=kernel-address 
-fasan-shadow-offset=0x10" } */
+
+int *bar(int *);
+int *f( int a)
+{
+  return bar();
+}
diff --git a/gcc/toplev.c b/gcc/toplev.c
index 95eea63380f6..48f13d282c52 100644
--- a/gcc/toplev.c
+++ b/gcc/toplev.c
@@ -1835,7 +1835,7 @@ process_options (void)
   /* Address Sanitizer needs porting to each target architecture.  */
 
   if ((flag_sanitize & SANITIZE_ADDRESS)
-  && (!FRAME_GROWS_DOWNWARD || targetm.asan_shadow_offset == NULL))
+  && !FRAME_GROWS_DOWNWARD)
 {
   warning_at (UNKNOWN_LOCATION, 0,
  "%<-fsanitize=address%> and %<-fsanitize=kernel-address%> "
@@ -1843,6 +1843,25 @@ process_options (void)
   flag_sanitize &= ~SANITIZE_ADDRESS;
 }
 
+  if ((flag_sanitize & SANITIZE_USER_ADDRESS)
+  && targetm.asan_shadow_offset == NULL)
+{
+  warning_at (UNKNOWN_LOCATION, 0,
+ "%<-fsanitize=address%> not supported for this target");
+  flag_sanitize &= ~SANITIZE_ADDRESS;
+}
+
+  if ((flag_sanitize & SANITIZE_KERNEL_ADDRESS)
+  && (targetm.asan_shadow_offset == NULL && param_asan_stack
+ && !asan_shadow_offset_set_p ()))
+{
+  warning_at (UNKNOWN_LOCATION, 0,
+ "%<-fsanitize=kernel-address%> with stack protection "
+ "is not supported without %<-fasan-shadow-offset=%> "
+ "for this target.");
+  flag_sanitize &= ~SANITIZE_ADDRESS;
+}
+
  /* Do not use IPA optimizations for register allocation if profiler is active
 or patchable function entries are inserted for run-time instrumentation
 or port does not emit prologue and epilogue as RTL.  */
-- 
2.27.0



RE: [PATCH][GCC][aarch64] Generation of adjusted ldp/stp for vector types

2020-07-22 Thread Przemyslaw Wirkus
[snip...]

> Przemek, if you don't have commit access already, please follow the steps on
> https://gcc.gnu.org/gitwrite.html (happy to sponsor).

Done.

Thank you, Richard, for sponsoring this and all the support!

Kind regards, 
Przemek



[PATCH] libgccjit: Improve doc and comments regarding type casts (Was: Clarifying the permitted type conversions)

2020-07-22 Thread Andrea Corallo
Alex Coplan  writes:

> Secondly, I wanted to clarify the situation with respect to explicit
> casts; that is, those conversions allowed by gcc_jit_context_new_cast().
> The docs [0] say:
>
> Currently only a limited set of conversions are possible:
>  - int <-> float
>  - int <-> bool
>  - P* <-> Q*, for pointer types P and Q
>
> However, empirically (at least on aarch64), libgccjit appears to allow
> me to compile casts between any pair of types in the following set
> without any complaint:
>
> {
>   SIGNED_CHAR,
>   UNSIGNED_CHAR,
>   SHORT,
>   UNSIGNED_SHORT,
>   INT,
>   UNSIGNED_INT,
>   LONG,
>   UNSIGNED_LONG,
>   LONG_LONG,
>   UNSIGNED_LONG_LONG
> }

Hi Alex,

Looking at the code I believe all these casts are meant to be supported
(read your intuition was correct).

Also IMO source of confusion is that the doc is mentioning 'int' and
'float' but I believe would be better to have like 'integral' and
'floating-point' to clearly disambiguates with respect to the C
types.

AFAIU the set of supported casts should be like:

 integral   <-> integral
 floating-point <-> floating-point
 integral   <-> floating-point
 integral   <-> bool
 P* <-> Q*   for pointer types P and Q.

I'd propose to install the following patch to make doc and comments
homogeneous at documenting what do we accept, and I guess we should just
consider bugs if some of these conversions is not handled correctly or
leads to ICE.

Bests

  Andrea

gcc/jit/ChangeLog

2020-07-21  Andrea Corallo  

* docs/_build/texinfo/libgccjit.texi (Type-coercion): Improve doc
on allowed type casting.
* docs/topics/expressions.rst (gccjit::context::new_cast)
(gcc_jit_context_new_cast): Likewise.
* libgccjit.c: Improve comment on allowed type casting.
* libgccjit.h: Likewise

>From 914b9e86808c947d4bb2b06c6960fd8031125f67 Mon Sep 17 00:00:00 2001
From: Andrea Corallo 
Date: Tue, 21 Jul 2020 20:12:23 +0200
Subject: [PATCH] libgccjit: improve documentation on type conversions

gcc/jit/ChangeLog

2020-07-21  Andrea Corallo  

	* docs/_build/texinfo/libgccjit.texi (Type-coercion): Improve doc
	on allowed type casting.
	* docs/topics/expressions.rst (gccjit::context::new_cast)
	(gcc_jit_context_new_cast): Likewise.
	* libgccjit.c: Improve comment on allowed type casting.
	* libgccjit.h: Likewise
---
 gcc/jit/docs/_build/texinfo/libgccjit.texi | 30 +++---
 gcc/jit/docs/topics/expressions.rst|  8 +++---
 gcc/jit/libgccjit.c|  8 +++---
 gcc/jit/libgccjit.h|  7 +++--
 4 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/gcc/jit/docs/_build/texinfo/libgccjit.texi b/gcc/jit/docs/_build/texinfo/libgccjit.texi
index 1e14be010426..b170f24d1bb1 100644
--- a/gcc/jit/docs/_build/texinfo/libgccjit.texi
+++ b/gcc/jit/docs/_build/texinfo/libgccjit.texi
@@ -6685,13 +6685,19 @@ Currently only a limited set of conversions are possible:
 @itemize *
 
 @item 
-int <-> float
+integral   <-> integral
 
 @item 
-int <-> bool
+floating-point <-> floating-point
 
 @item 
-P*  <-> Q*, for pointer types P and Q
+integral   <-> floating-point
+
+@item 
+integral   <-> bool
+
+@item 
+P* <-> Q*   for pointer types P and Q
 @end itemize
 @end quotation
 @end deffn
@@ -12964,14 +12970,20 @@ Currently only a limited set of conversions are possible:
 
 @itemize *
 
-@item 
-int <-> float
+@item
+integral   <-> integral
 
-@item 
-int <-> bool
+@item
+floating-point <-> floating-point
 
-@item 
-P*  <-> Q*, for pointer types P and Q
+@item
+integral   <-> floating-point
+
+@item
+integral   <-> bool
+
+@item
+P* <-> Q*, for pointer types P and Q
 @end itemize
 @end quotation
 @end deffn
diff --git a/gcc/jit/docs/topics/expressions.rst b/gcc/jit/docs/topics/expressions.rst
index d783ceea51a8..051cee5db211 100644
--- a/gcc/jit/docs/topics/expressions.rst
+++ b/gcc/jit/docs/topics/expressions.rst
@@ -504,9 +504,11 @@ Type-coercion
 
Currently only a limited set of conversions are possible:
 
- * int <-> float
- * int <-> bool
- * P*  <-> Q*, for pointer types P and Q
+ * integral   <-> integral
+ * floating-point <-> floating-point
+ * integral   <-> floating-point
+ * integral   <-> bool
+ * P* <-> Q*   for pointer types P and Q
 
 Lvalues
 ---
diff --git a/gcc/jit/libgccjit.c b/gcc/jit/libgccjit.c
index 3d04f6db3aff..403233d5577a 100644
--- a/gcc/jit/libgccjit.c
+++ b/gcc/jit/libgccjit.c
@@ -1629,9 +1629,11 @@ gcc_jit_context_new_call_through_ptr (gcc_jit_context *ctxt,
 
We only permit these kinds of cast:
 
- int <-> float
- int <-> bool
- P*  <-> Q*   for pointer types P and Q.  */
+ integral   <-> integral
+ floating-point <-> floating-point
+ integral   <-> floating-point
+ integral   <-> bool
+ P* <-> Q*   for pointer types P and Q.  */
 
 

Re: [PATCH 2/3] expmed: Fix possible use of NULL_RTX return value from emit_store_flag

2020-07-22 Thread Richard Sandiford
Jozef Lawrynowicz  writes:
> diff --git a/gcc/expmed.c b/gcc/expmed.c
> index e7c03fbf92c..d3a1735d39e 100644
> --- a/gcc/expmed.c
> +++ b/gcc/expmed.c
> @@ -4086,9 +4086,12 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, 
> HOST_WIDE_INT d)
>  {
>temp = gen_reg_rtx (mode);
>temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, 1);
> -  temp = expand_binop (mode, add_optab, temp, op0, NULL_RTX,
> -0, OPTAB_LIB_WIDEN);
> -  return expand_shift (RSHIFT_EXPR, mode, temp, logd, NULL_RTX, 0);
> +  if (temp != NULL_RTX)
> + {
> +   temp = expand_binop (mode, add_optab, temp, op0, NULL_RTX,
> +0, OPTAB_LIB_WIDEN);
> +   return expand_shift (RSHIFT_EXPR, mode, temp, logd, NULL_RTX, 0);
> + }
>  }
>  
>if (HAVE_conditional_move
> @@ -4122,17 +4125,20 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, 
> HOST_WIDE_INT d)
>  
>temp = gen_reg_rtx (mode);
>temp = emit_store_flag (temp, LT, op0, const0_rtx, mode, 0, -1);
> -  if (GET_MODE_BITSIZE (mode) >= BITS_PER_WORD
> -   || shift_cost (optimize_insn_for_speed_p (), mode, ushift)
> -  > COSTS_N_INSNS (1))
> - temp = expand_binop (mode, and_optab, temp, gen_int_mode (d - 1, mode),
> -  NULL_RTX, 0, OPTAB_LIB_WIDEN);
> -  else
> - temp = expand_shift (RSHIFT_EXPR, mode, temp,
> -  ushift, NULL_RTX, 1);
> -  temp = expand_binop (mode, add_optab, temp, op0, NULL_RTX,
> -0, OPTAB_LIB_WIDEN);
> -  return expand_shift (RSHIFT_EXPR, mode, temp, logd, NULL_RTX, 0);
> +  if (temp != NULL_RTX)
> + {
> +   if (GET_MODE_BITSIZE (mode) >= BITS_PER_WORD
> +   || shift_cost (optimize_insn_for_speed_p (), mode, ushift)
> +   > COSTS_N_INSNS (1))
> + temp = expand_binop (mode, and_optab, temp, gen_int_mode (d - 1, 
> mode),

Long line.

OK otherwise, thanks.  I guess these failed attempts will leave
a few unused temporary registers around (from the gen_reg_rtxes)
but it's going to be hard to avoid that in a clean way.

Richard

> +  NULL_RTX, 0, OPTAB_LIB_WIDEN);
> +   else
> + temp = expand_shift (RSHIFT_EXPR, mode, temp,
> +  ushift, NULL_RTX, 1);
> +   temp = expand_binop (mode, add_optab, temp, op0, NULL_RTX,
> +0, OPTAB_LIB_WIDEN);
> +   return expand_shift (RSHIFT_EXPR, mode, temp, logd, NULL_RTX, 0);
> + }
>  }
>  
>label = gen_label_rtx ();


Re: [committed] correct memcmp expansion of constant representations containing embedded nuls (PR 95189)

2020-07-22 Thread Iain Sandoe

Hi Martin,

Rainer Orth  wrote:


I have committed this change in r11-2231 after Jeff approved it
off list last Thursday.


the new gcc.target/i386/memcpy-pr95886.c test FAILs on 32-bit x86
(i386-pc-solaris2.11):

+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 1976943448883713" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 576467370915332609" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 578431098682540545" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 578437695685198337" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 578437695685198337" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 578437695752110593" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 578437695752306689" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 578437695752307200" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand  
"const_int 578437695752307201” 2


likewise, the tests fail on m32 Darwin,
thanks

Iain



Re: [PATCH 1/3] expr: Allow scalar_int_mode target mode when converting a constant

2020-07-22 Thread Richard Sandiford
Jozef Lawrynowicz  writes:
> is_int_mode does not allow MODE_PARTIAL_INT modes, so convert_modes was
> not allowing a constant value to be converted to a MODE_PARTIAL_INT for
> use as operand 2 in patterns such as ashlpsi3. The constant had
> to be copied into a register before it could be used, but now can be
> used directly as an operand without any copying.

Yeah.  I guess this dates back to when MODE_PARTIAL_INTs didn't have
a known precision.

> diff --git a/gcc/expr.c b/gcc/expr.c
> index c7c3e9fd655..5a252f0935f 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -696,7 +696,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, 
> rtx x, int unsignedp)
>  return x;
>  
>if (CONST_SCALAR_INT_P (x)
> -  && is_int_mode (mode, _mode))
> +  && is_a  (mode, _mode))
>  {
>/* If the caller did not tell us the old mode, then there is not
>much to do with respect to canonicalization.  We have to

I think we also need to change the condition in:

  /* If the caller did not tell us the old mode, then there is not
 much to do with respect to canonicalization.  We have to
 assume that all the bits are significant.  */
  if (GET_MODE_CLASS (oldmode) != MODE_INT)

to is_a  (old_mode)

OK with that, thanks.

Richard


Re: [PATCH] rs6000: Rename adjust_vectorization_cost

2020-07-22 Thread Segher Boessenkool
Hi!

On Wed, Jul 22, 2020 at 09:44:52AM +0800, Kewen.Lin wrote:
> This trivial patch is to rename adjust_vectorization_cost to 
> adjust_vect_cost_per_stmt.  Hope it's more meaningful, as well
> as to avoid the confusion between the possible to be landed
> function "adjust_vect_cost" and "adjust_vectorization_cost".
> 
> Even without "adjust_vect_cost", I guess it's still good?

It is an improvement for sure, so it is okay for trunk of course.  It
still isn't very clear from the name how this would differ from
adjust_vect_cost, but that _is_ obviously the more generic name, so
that is good.

Thanks,


Segher


Re: [committed] correct memcmp expansion of constant representations containing embedded nuls (PR 95189)

2020-07-22 Thread Rainer Orth
Hi Martin,

> I have committed this change in r11-2231 after Jeff approved it
> off list last Thursday.

the new gcc.target/i386/memcpy-pr95886.c test FAILs on 32-bit x86
(i386-pc-solaris2.11):

+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
1976943448883713" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
576467370915332609" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
578431098682540545" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
578437695685198337" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
578437695685198337" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
578437695752110593" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
578437695752306689" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
578437695752307200" 1
+FAIL: gcc.target/i386/memcpy-pr95886.c scan-rtl-dump-times expand "const_int 
578437695752307201" 2

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


[PATCH] Using gen_int_mode instead of GEN_INT to avoid ICE caused by type promotion.

2020-07-22 Thread Hongtao Liu via Gcc-patches
  Bootstrap is ok, regression test is ok for i386 backend.

gcc/
PR target/96262
* config/i386/i386-expand.c
(ix86_expand_vec_shift_qihi_constant): Refine.

gcc/testsuite/
* gcc.target/i386/pr96262-1.c: New test.

---
 gcc/config/i386/i386-expand.c |  6 +++---
 gcc/testsuite/gcc.target/i386/pr96262-1.c | 11 +++
 2 files changed, 14 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96262-1.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index e194214804b..d57d043106a 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -19537,7 +19537,7 @@ bool
 ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest,
rtx op1, rtx op2)
 {
   machine_mode qimode, himode;
-  unsigned int and_constant, xor_constant;
+  HOST_WIDE_INT and_constant, xor_constant;
   HOST_WIDE_INT shift_amount;
   rtx vec_const_and, vec_const_xor;
   rtx tmp, op1_subreg;
@@ -19612,7 +19612,7 @@ ix86_expand_vec_shift_qihi_constant (enum
rtx_code code, rtx dest, rtx op1, rtx
   emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
   emit_move_insn (vec_const_and,
  ix86_build_const_vector (qimode, true,
-  GEN_INT (and_constant)));
+  gen_int_mode (and_constant,
QImode)));
   emit_insn (gen_and (dest, dest, vec_const_and));

   /* For ASHIFTRT, perform extra operation like
@@ -19623,7 +19623,7 @@ ix86_expand_vec_shift_qihi_constant (enum
rtx_code code, rtx dest, rtx op1, rtx
   vec_const_xor = gen_reg_rtx (qimode);
   emit_move_insn (vec_const_xor,
  ix86_build_const_vector (qimode, true,
-  GEN_INT (xor_constant)));
+  gen_int_mode
(xor_constant, QImode)));
   emit_insn (gen_xor (dest, dest, vec_const_xor));
   emit_insn (gen_sub (dest, dest, vec_const_xor));
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr96262-1.c
b/gcc/testsuite/gcc.target/i386/pr96262-1.c
new file mode 100644
index 000..1825388072e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96262-1.c
@@ -0,0 +1,11 @@
+/* PR target/96262 */
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O" } */
+
+typedef char __attribute__ ((__vector_size__ (64))) V;
+
+V
+foo (V v)
+{
+  return ~(v << 1);
+}
-- 

-- 
BR,
Hongtao


Re: [PATCH v2] vect/rs6000: Support vector with length cost modeling

2020-07-22 Thread Kewen.Lin via Gcc-patches
Hi Richard,

on 2020/7/22 下午2:38, Richard Biener wrote:
> On Wed, Jul 22, 2020 at 3:26 AM Kewen.Lin  wrote:
>>
>> Hi Richard,
>>
>> on 2020/7/21 下午3:57, Richard Biener wrote:
>>> On Tue, Jul 21, 2020 at 7:52 AM Kewen.Lin  wrote:

 Hi,

 This patch is to add the cost modeling for vector with length,
 it mainly follows what we generate for vector with length in
 functions vect_set_loop_controls_directly and vect_gen_len
 at the worst case.

 For Power, the length is expected to be in bits 0-7 (high bits),
 we have to model the cost of shifting bits.  To allow other targets
 not suffer this, I used one target hook to describe this extra cost,
 I'm not sure if it's a correct way.

 Bootstrapped/regtested on powerpc64le-linux-gnu (P9) with explicit
 param vect-partial-vector-usage=1.

 Any comments/suggestions are highly appreciated!
>>>
>>> I don't like the introduction of an extra target hook for this.  All
>>> vectorizer cost modeling should ideally go through
>>> init_cost/add_stmt_cost/finish_cost.  If the extra costing is
>>> not per stmt then either init_cost or finish_cost is appropriate.
>>> Currently init_cost only gets a struct loop while we should
>>> probably give it a vec_info * parameter so targets can
>>> check LOOP_VINFO_USING_PARTIAL_VECTORS_P and friends.
>>>
>>
>> Thanks!  Nice, your suggested way looks better.  I've removed the hook
>> and taken care of it in finish_cost.  The updated v2 is attached.
>>
>> Bootstrapped/regtested again on powerpc64le-linux-gnu (P9) with explicit
>> param vect-partial-vector-usage=1.
> 
> LGTM (with assuming the first larger hunk is mostly re-indenting
> under LOOP_VINFO_USING_PARTIAL_VECTORS_P).

Thanks for the review!  Yes, for the original LOOP_VINFO_FULLY_MASKED_P
hunk, this patch moves the handling of gap peeling to be shared between
mask and length, and re-indent the remaining (masking specific) into inner
LOOP_VINFO_FULLY_MASKED_P.  The length specific is put into the else hunk.
It wouldn't change anything for masking, I'll run aarch64 regtesting to
ensure it.  :)

BR,
Kewen


Re: [PATCH PR96053] Add "#pragma GCC no_reduc_chain"

2020-07-22 Thread Richard Biener
On Wed, 22 Jul 2020, zhoukaipeng (A) wrote:

> Hi,
> 
> It is the patch to add "#pragma GCC no_reduc_chain" for pr96053.  It 
> only completes the front end of C language.
> 
> For the testcase, it successfully skipped doing slp by finding sequences 
> from reduction chains.  Without "#pragma GCC no_reduc_chain", it will 
> fail to do vectorization.
> 
> Please help to check if there is any problem.  If there is no problem, I 
> will continue to complete the front end of the remaining languages.

First of all I think giving users more control over vectorization is
good.  Now as for "#pragma GCC no_reduc_chain" I'd like to avoid
negatives and terms internal to GCC.  I also would like to see
vectorization pragmas to be grouped somehow, also to avoid bit
explosion in struct loop.  There's already annot_expr_no_vector_kind
and annot_expr_vector_kind both only used by the fortran FE at
the moment.  Note ANNOATE_EXPR already allows an extra argument
thus only annot_expr_vector_kind should prevail with its argument
specifying a bitmask of vectorizer hints.  We'd have an extra
enum for those like

enum annot_vector_subkind {
  annot_vector_never = 0,
  annot_vector_auto = 1, // this is the default
  annot_vector_always = 3,
  your new flag
};

and the user would specify it via

#pragma GCC vect [(never|always|auto)] [your new flag]

now, I honestly have a difficulty in suggesting a better name
than no_reduc_chain.  Quoting the testcase:

+double f(double *a, double *b)
+{
+  double res1 = 0;
+  double res0 = 0;
+#pragma GCC no_reduc_chain
+  for (int i = 0 ; i < 1000; i+=4) {
+res0 += a[i] * b[i];
+res1 += a[i+1] * b[i*1];
+res0 += a[i+2] * b[i+2];
+res1 += a[i+3] * b[i+3];
+  }
+  return res0 + res1;
+}

for your case with IIRC V2DF vectors using reduction chains will
result in a vectorization factor of two while with a SLP reduction the
vectorization factor is one.  
So maybe it is better to give the user control over the vectorization
factor?  That's desirable in other cases where the user wants to force
a larger VF to get extra unrolling for example.  For the testcase above
you'd use

#pragma GCC vect vf(1)

or so (syntax to be discussed).  The side-effect would be that
with a reduction chain the VF request cannot be fulfilled but
with a SLP reduction it can.  Of course no_reduc_chain is much
easier to actually implement in a strict way while specifying
VF will likely need to be documented as a hint (with an eventual
diagnostic if it wasn't fulfilled)

Richard/Jakub, any thoughts?

Thanks,
Richard.


Re: [PATCH v2] vect/rs6000: Support vector with length cost modeling

2020-07-22 Thread Richard Biener via Gcc-patches
On Wed, Jul 22, 2020 at 3:26 AM Kewen.Lin  wrote:
>
> Hi Richard,
>
> on 2020/7/21 下午3:57, Richard Biener wrote:
> > On Tue, Jul 21, 2020 at 7:52 AM Kewen.Lin  wrote:
> >>
> >> Hi,
> >>
> >> This patch is to add the cost modeling for vector with length,
> >> it mainly follows what we generate for vector with length in
> >> functions vect_set_loop_controls_directly and vect_gen_len
> >> at the worst case.
> >>
> >> For Power, the length is expected to be in bits 0-7 (high bits),
> >> we have to model the cost of shifting bits.  To allow other targets
> >> not suffer this, I used one target hook to describe this extra cost,
> >> I'm not sure if it's a correct way.
> >>
> >> Bootstrapped/regtested on powerpc64le-linux-gnu (P9) with explicit
> >> param vect-partial-vector-usage=1.
> >>
> >> Any comments/suggestions are highly appreciated!
> >
> > I don't like the introduction of an extra target hook for this.  All
> > vectorizer cost modeling should ideally go through
> > init_cost/add_stmt_cost/finish_cost.  If the extra costing is
> > not per stmt then either init_cost or finish_cost is appropriate.
> > Currently init_cost only gets a struct loop while we should
> > probably give it a vec_info * parameter so targets can
> > check LOOP_VINFO_USING_PARTIAL_VECTORS_P and friends.
> >
>
> Thanks!  Nice, your suggested way looks better.  I've removed the hook
> and taken care of it in finish_cost.  The updated v2 is attached.
>
> Bootstrapped/regtested again on powerpc64le-linux-gnu (P9) with explicit
> param vect-partial-vector-usage=1.

LGTM (with assuming the first larger hunk is mostly re-indenting
under LOOP_VINFO_USING_PARTIAL_VECTORS_P).

Thanks,
Richard.

> BR,
> Kewen
> -
> gcc/ChangeLog:
>
> * config/rs6000/rs6000.c (adjust_vect_cost): New function.
> (rs6000_finish_cost): Call function adjust_vect_cost.
> * tree-vect-loop.c (vect_estimate_min_profitable_iters): Add cost
> modeling for vector with length.


Re: [PATCH] c++: decl_constant_value and unsharing [PR96197]

2020-07-22 Thread Richard Biener via Gcc-patches
On Tue, Jul 21, 2020 at 9:08 PM Patrick Palka via Gcc-patches
 wrote:
>
> In the testcase from the PR we are seeing excessive memory use (> 5GB)
> during constexpr evaluation, almost all of which is due to the call to
> decl_constant_value in the VAR_DECL/CONST_DECL branch of
> cxx_eval_constant_expression.  We reach here every time we evaluate an
> ARRAY_REF of a constexpr VAR_DECL, which in this testcase is quite
> often, and from there decl_constant_value makes an unshared copy of the
> VAR_DECL's initializer, even though the unsharing is not needed at this
> call site (because it is up to callers of cxx_eval_constant_expression
> to unshare).
>
> To fix this, this patch moves the responsibility of unsharing the result
> of decl_constant_value, decl_really_constant_value and
> scalar_constant_value from the callee to the caller.
>
> Fortunately there's only six calls to these functions, two of which are
> from cxx_eval_constant_expression where the unsharing is undesirable.
> And in unify there is one call, to scalar_constant_value, that looks
> like:
>
>case CONST_DECL:
>  if (DECL_TEMPLATE_PARM_P (parm))
>return ...;
> >if (arg != scalar_constant_value (parm))
>return ...;
>
> where we are suspiciously testing for pointer equality despite
> scalar_constant_value's unsharing behavior.  This line seems to be dead
> code however, so this patch replaces it with an appropriate gcc_assert.
> Finally, this patch adds an explicit call to unshare_expr to the
> remaining three callers.
>
> Now that the the calls to decl_constant_value and
> decl_really_constant_value from cxx_eval_constant_expression no longer
> unshare their result, memory use during constexpr evaluation for the
> testcase in the PR falls from 5GB to 15MB according to -ftime-report.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu, and also tested on
> cmcstl2 and a number of other libraries.  Does this look OK to commit?

Can you add the PRs testcase?  Thanks for tracking this down! (but I can't
approve the patch)

Richard.

> gcc/cp/ChangeLog:
>
> PR c++/96197
> * cp-gimplify.c (cp_fold_maybe_rvalue): Call unshare_expr on the
> result of decl_constant_value.
> * cvt.c: Include gimplify.h.
> (ocp_convert): Call unshare_expr on the result of
> scalar_constant_value.
> * init.c (constant_value_1): Don't call unshare_expr here,
> so that callers can choose whether to unshare.
> * pt.c (tsubst_copy): Call unshare_expr on the result of
> scalar_constant_value.
> (unify) : Assert DECL_TEMPLATE_PARM_P and
> simplify accordingly.
> ---
>  gcc/cp/cp-gimplify.c | 2 +-
>  gcc/cp/cvt.c | 3 ++-
>  gcc/cp/init.c| 2 +-
>  gcc/cp/pt.c  | 9 +++--
>  4 files changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/gcc/cp/cp-gimplify.c b/gcc/cp/cp-gimplify.c
> index 0e949e29c5c..5c5c44dbc5d 100644
> --- a/gcc/cp/cp-gimplify.c
> +++ b/gcc/cp/cp-gimplify.c
> @@ -2433,7 +2433,7 @@ cp_fold_maybe_rvalue (tree x, bool rval)
>   tree v = decl_constant_value (x);
>   if (v != x && v != error_mark_node)
> {
> - x = v;
> + x = unshare_expr (v);
>   continue;
> }
> }
> diff --git a/gcc/cp/cvt.c b/gcc/cp/cvt.c
> index c9e7b1ff044..a7e40a1fa51 100644
> --- a/gcc/cp/cvt.c
> +++ b/gcc/cp/cvt.c
> @@ -36,6 +36,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "stringpool.h"
>  #include "attribs.h"
>  #include "escaped_string.h"
> +#include "gimplify.h"
>
>  static tree convert_to_pointer_force (tree, tree, tsubst_flags_t);
>  static tree build_type_conversion (tree, tree);
> @@ -725,7 +726,7 @@ ocp_convert (tree type, tree expr, int convtype, int 
> flags,
>e = mark_rvalue_use (e);
>tree v = scalar_constant_value (e);
>if (!error_operand_p (v))
> -   e = v;
> +   e = unshare_expr (v);
>  }
>if (error_operand_p (e))
>  return error_mark_node;
> diff --git a/gcc/cp/init.c b/gcc/cp/init.c
> index ef4b3c4dc3c..bf229bd2ba5 100644
> --- a/gcc/cp/init.c
> +++ b/gcc/cp/init.c
> @@ -2343,7 +2343,7 @@ constant_value_1 (tree decl, bool strict_p, bool 
> return_aggregate_cst_ok_p)
>   && !DECL_INITIALIZED_BY_CONSTANT_EXPRESSION_P (decl)
>   && DECL_NONTRIVIALLY_INITIALIZED_P (decl))
> break;
> -  decl = unshare_expr (init);
> +  decl = init;
>  }
>return decl;
>  }
> diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
> index 34876788a9c..4d3ee099cea 100644
> --- a/gcc/cp/pt.c
> +++ b/gcc/cp/pt.c
> @@ -16368,7 +16368,7 @@ tsubst_copy (tree t, tree args, tsubst_flags_t 
> complain, tree in_decl)
>   return t;
> /* If ARGS is NULL, then T is known to be non-dependent.  */
> if (args == NULL_TREE)
> - return scalar_constant_value (t);
> + return unshare_expr (scalar_constant_value (t));
>
> /* Unfortunately, we cannot 

Re: [PATCH] libgcc: Use `-fasynchronous-unwind-tables' for LIB2_DIVMOD_FUNCS

2020-07-22 Thread Richard Biener via Gcc-patches
On Tue, Jul 21, 2020 at 8:24 PM Maciej W. Rozycki  wrote:
>
> Complement commit b932f770f70d ("x86_64 frame unwind info"), SVN r46374,
> , and replace
> `-fexceptions -fnon-call-exceptions' with `-fasynchronous-unwind-tables'
> in LIB2_DIVMOD_FUNCS compilation flags so as to provide unwind tables
> for the affected functions while not pulling the unwinder proper, which
> is not required here.
>
> Remove the ARM overrides accordingly, retaining the hook infrastructure
> however, and make the ARM test case a generic one.

You're changing arm files only so CCing arm maintainers.

Richard.

> Beyond saving program space it fixes a RISC-V glibc build error due to
> unsatisfied `malloc' and `free' references from the unwinder causing
> link errors with `ld.so' where libgcc has been built at -O0.
> ---
> Hi,
>
>  As discussed: .
>
>  This has been regression-tested with all the GCC compiler testsuites with
> the `x86_64-linux' native configuration, which in particular means the
> moved ARM test case scored the UNSUPPORTED result.  I have no access to a
> non-Linux configuration right now, so I cannot verify this test case, but
> in principle I expect it to work across the relevant targets (and the
> irrelevant ones can be excluded as they are discovered).
>
>  OK to apply then?  It may make sense to backport this fix too to the
> active release branches; please let me know if to do so.
>
>  NB the original commit referred appears to contain more than just the
> corresponding mailing list posting; it looks like several patches were
> folded together before comitting, so this is as good as you can get.
>
>   Maciej
> ---
>  gcc/testsuite/gcc.dg/div64-unwinding.c |   25 
> +
>  gcc/testsuite/gcc.target/arm/div64-unwinding.c |   25 
> -
>  libgcc/Makefile.in |2 +-
>  libgcc/config/arm/t-bpabi  |5 -
>  libgcc/config/arm/t-netbsd-eabi|5 -
>  5 files changed, 26 insertions(+), 36 deletions(-)
>
> gcc-libgcc-divmod-asynchronous-unwind-tables.diff
> Index: gcc/gcc/testsuite/gcc.dg/div64-unwinding.c
> ===
> --- /dev/null
> +++ gcc/gcc/testsuite/gcc.dg/div64-unwinding.c
> @@ -0,0 +1,25 @@
> +/* Performing a 64-bit division should not pull in the unwinder.  */
> +
> +/* { dg-do run { target { { ! *-*-linux* } && { ! *-*-uclinux* } } } } */
> +/* { dg-skip-if "load causes weak symbol resolution" { vxworks_kernel } } */
> +/* { dg-options "-O0" } */
> +
> +#include 
> +
> +long long
> +foo (long long c, long long d)
> +{
> +  return c/d;
> +}
> +
> +long long x = 0;
> +long long y = 1;
> +
> +extern int (*_Unwind_RaiseException) (void *) __attribute__((weak));
> +
> +int main(void)
> +{
> +  if (&_Unwind_RaiseException != NULL)
> +abort ();;
> +  return foo (x, y);
> +}
> Index: gcc/gcc/testsuite/gcc.target/arm/div64-unwinding.c
> ===
> --- gcc.orig/gcc/testsuite/gcc.target/arm/div64-unwinding.c
> +++ /dev/null
> @@ -1,25 +0,0 @@
> -/* Performing a 64-bit division should not pull in the unwinder.  */
> -
> -/* { dg-do run { target { { ! *-*-linux* } && { ! *-*-uclinux* } } } } */
> -/* { dg-skip-if "load causes weak symbol resolution" { vxworks_kernel } } */
> -/* { dg-options "-O0" } */
> -
> -#include 
> -
> -long long
> -foo (long long c, long long d)
> -{
> -  return c/d;
> -}
> -
> -long long x = 0;
> -long long y = 1;
> -
> -extern int (*_Unwind_RaiseException) (void *) __attribute__((weak));
> -
> -int main(void)
> -{
> -  if (&_Unwind_RaiseException != NULL)
> -abort ();;
> -  return foo (x, y);
> -}
> Index: gcc/libgcc/Makefile.in
> ===
> --- gcc.orig/libgcc/Makefile.in
> +++ gcc/libgcc/Makefile.in
> @@ -533,7 +533,7 @@ endif
>  ifeq ($(LIB2_DIVMOD_EXCEPTION_FLAGS),)
>  # Provide default flags for compiling divmod functions, if they haven't been
>  # set already by a target-specific Makefile fragment.
> -LIB2_DIVMOD_EXCEPTION_FLAGS := -fexceptions -fnon-call-exceptions
> +LIB2_DIVMOD_EXCEPTION_FLAGS := -fasynchronous-unwind-tables
>  endif
>
>  # Build LIB2_DIVMOD_FUNCS.
> Index: gcc/libgcc/config/arm/t-bpabi
> ===
> --- gcc.orig/libgcc/config/arm/t-bpabi
> +++ gcc/libgcc/config/arm/t-bpabi
> @@ -13,8 +13,3 @@ LIB2ADDEH = $(srcdir)/config/arm/unwind-
>
>  # Add the BPABI names.
>  SHLIB_MAPFILES += $(srcdir)/config/arm/libgcc-bpabi.ver
> -
> -# On ARM, specifying -fnon-call-exceptions will needlessly pull in
> -# the unwinder in simple programs which use 64-bit division.  Omitting
> -# the option is safe.
> -LIB2_DIVMOD_EXCEPTION_FLAGS := -fexceptions
> Index: gcc/libgcc/config/arm/t-netbsd-eabi
>