Re: [PATCH GCC][4/5]Improve loop distribution to handle hmmer

2017-06-07 Thread kugan

Hi Bin,


+
+/* In reduced dependence graph RDG for loop distribution, return true if
+   dependence between references DR1 and DR2 may create dependence cycle
+   and such dependence cycle can't be resolved by runtime alias check.  */
+
+static bool
+possible_data_dep_cycle_p (struct graph *rdg,
+  hash_table *ddr_table,
+  data_reference_p dr1, data_reference_p dr2)


This name seems a bit misleading. It is basically a dependence test, 
isn't it? Of course this can lead to a cycle, but possible_data_dep_p 
looks like a better name.



+{
+  struct data_dependence_relation *ddr;
+
+  /* Re-shuffle data-refs to be in topological order.  */
+  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
+  > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
+std::swap (dr1, dr2);
+
+  ddr = get_ddr (rdg, ddr_table, dr1, dr2);
+
+  /* In case something goes wrong in data dependence analysis.  */
+  if (ddr == NULL)
+return true;
+  /* In case of no data dependence.  */
+  else if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+return false;
+  /* Or the data dependence can be resolved by compilation time alias
+ check.  */
+  else if (!alias_sets_conflict_p (get_alias_set (DR_REF (dr1)),
+  get_alias_set (DR_REF (dr2))))
+return false;
+  /* For unknown data dependence or known data dependence which can't be
+ expressed in classic distance vector, we check if it can be resolved
+ by runtime alias check.  If yes, we still consider data dependence
+ as won't introduce data dependence cycle.  */
+  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
+  || DDR_NUM_DIST_VECTS (ddr) == 0)


You have already handled chrec_known above. Can you still have a known 
data dependence which can't be expressed as a classic distance vector?
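
One textbook case (illustrative, not from the patch) is a non-uniform
dependence: the dependence provably exists, but its distance varies with
the iteration, so no classic distance vector can describe it:

  /* Illustration only: a[0] is both read and written at i == 0, so the
     dependence is known, but the distance (2*i - i == i) is not constant.  */
  void f (int *a, int n)
  {
    for (int i = 0; i < n; i++)
      a[2 * i] = a[i] + 1;
  }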



+return !runtime_alias_check_p (ddr, NULL, true);
+  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
+return true;
+  else if (DDR_REVERSED_P (ddr)
+  || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
+return false;
+
+  return true;
+}
  
  /* Returns a partition with all the statements needed for computing
     the vertex V of the RDG, also including the loop exit conditions.  */

  static partition *
-build_rdg_partition_for_vertex (struct graph *rdg, int v)
+build_rdg_partition_for_vertex (struct graph *rdg,
+   hash_table *ddr_table, int v)
  {
partition *partition = partition_alloc (NULL, NULL);
auto_vec nodes;
-  unsigned i;
+  unsigned i, j;
int x;
+  data_reference_p dr, dr1, dr2;
  
graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
  
@@ -1005,8 +1262,43 @@ build_rdg_partition_for_vertex (struct graph *rdg, int v)

bitmap_set_bit (partition->stmts, x);
bitmap_set_bit (partition->loops,
  loop_containing_stmt (RDG_STMT (rdg, x))->num);
+
+  for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
+   {
+ /* Partition can only be executed sequentially if there is any
+unknown data reference.  */
+ if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
+ || !DR_INIT (dr) || !DR_STEP (dr))
+   partition->type = PTYPE_SEQUENTIAL;
+
+ if (DR_IS_READ (dr))
+   partition->reads.safe_push (dr);
+ else
+   partition->writes.safe_push (dr);
+   }
  }
  
+  if (partition->type == PTYPE_SEQUENTIAL)
+    return partition;
+
+  /* Further check if any data dependence prevents us from executing the
+     partition in parallel.  */
+  for (i = 0; partition->reads.iterate (i, &dr1); ++i)
+    for (j = 0; partition->writes.iterate (j, &dr2); ++j)
+      if (possible_data_dep_cycle_p (rdg, ddr_table, dr1, dr2))
+   {
+ partition->type = PTYPE_SEQUENTIAL;
+ return partition;
+   }
+
+  for (i = 0; partition->writes.iterate (i, &dr1); ++i)
+    for (j = i + 1; partition->writes.iterate (j, &dr2); ++j)
+      if (possible_data_dep_cycle_p (rdg, ddr_table, dr1, dr2))
+   {
+ partition->type = PTYPE_SEQUENTIAL;
+ return partition;
+   }
+
return partition;
  }
  
@@ -1014,7 +1306,9 @@ build_rdg_partition_for_vertex (struct graph *rdg, int v)

 For the moment we detect only the memset zero pattern.  */
  
  static void

-classify_partition (loop_p loop, struct graph *rdg, partition *partition)
+classify_partition (loop_p loop, struct graph *rdg,
+   hash_table *ddr_table,
+   partition *partition, bitmap stmt_in_all_partitions)
  {
bitmap_iterator bi;
unsigned i;
@@ -1022,6 +1316,7 @@ classify_partition (loop_p loop, struct graph *rdg, partition *partition)
data_reference_p single_load, single_store;
bool volatiles_p = false;
bool plus_one = false;
+  bool has_reduction = false;
  
partition->kind = PKIND_NORMAL;

partition->main_dr = NULL;
@@ -1036,16 +1331,24 @@ classify_partition (loop_p loop, struct graph *rdg, partition *partition)

Re: [PATCH] handle bzero/bcopy in DSE and aliasing (PR 80933, 80934)

2017-06-07 Thread Martin Sebor

On 06/07/2017 02:12 PM, Martin Sebor wrote:

On 06/07/2017 02:01 PM, Marc Glisse wrote:

On Wed, 7 Jun 2017, Bernhard Reutner-Fischer wrote:


On 7 June 2017 16:46:53 CEST, Martin Sebor  wrote:

On 06/07/2017 02:23 AM, Richard Biener wrote:

On Wed, Jun 7, 2017 at 5:26 AM, Martin Sebor 

wrote:

Note I'd be _much_ more sympathetic to simply canonicalizing all of
bzero and bcopy
to memset / memmove and be done with all the above complexity.



Attached is an updated patch along these lines.  Please let me
know if it matches your expectations.


I think you attached the wrong patch.


Yes I did, sorry.  The correct one is attached.


Under POSIX.1-2008 "optimizing" bzero or bcmp is IMO plain wrong.

It's like optimizing foo() to a random built-in but maybe that's just
me. If your libc provides a define to a standard function for these
under a compat knob then fine but otherwise you should fix that.
*shrug*. Joseph?


The patch optimizes __builtin_bzero, which should be ok. The question
(independent from this patch) is then under what conditions bzero should
be detected as a builtin.
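
For reference, the effect of the canonicalization is simply this
(illustrative sketch, not the patch itself):

  #include <stddef.h>

  void zero (void *p, size_t n)
  {
    /* This call ...  */
    __builtin_bzero (p, n);
    /* ... is folded as if written
         __builtin_memset (p, 0, n);
       so the existing memset DSE and alias handling applies.  */
  }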


Yes.  The problem is that unlike for C and C++, GCC doesn't have
a mechanism to select the target version of POSIX.  I think it
should.

But there is a subtle problem with the patch that needs fixing.
Bcopy should not be transformed to memcpy but rather memmove.
I'll fix that before committing.
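
The difference matters when the regions overlap; a minimal illustration
(not from the patch):

  #include <strings.h>

  char buf[] = "abcdefg";

  void overlap (void)
  {
    bcopy (buf, buf + 1, 4);  /* source and destination overlap */
    /* memmove semantics yield "aabcdfg"; memcpy would be undefined
       here because the regions overlap.  */
  }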


Attached is an updated patch with this fix.  I also added a cast
from bcopy and bzero to void to detect accidental uses of the
return value.  Tested on x86_64-linux.

Martin
PR tree-optimization/80934 - bzero should be assumed not to escape pointer argument
PR tree-optimization/80933 - redundant bzero/bcopy calls not eliminated

gcc/ChangeLog:

	PR tree-optimization/80933
	PR tree-optimization/80934
	* builtins.c (fold_builtin_bcmp, fold_builtin_bcopy): New functions.
	(fold_builtin_bzero): Likewise.
	(fold_builtin_2): Handle bzero.
	(fold_builtin_3): Handle bcmp and bcpy.

gcc/testsuite/ChangeLog:

	PR tree-optimization/80933
	PR tree-optimization/80934
	* gcc.dg/fold-bcopy.c: New test.
	* gcc.dg/tree-ssa/ssa-dse-30.c: Likewise.
	* gcc.dg/tree-ssa/alias-36.c: Likewise.
	* gcc.dg/pr79214.c: Adjust.
	* gcc.dg/tree-prof/val-prof-7.c: Likewise.
	* gcc.dg/Wsizeof-pointer-memaccess1.c: Likewise.
	* gcc.dg/builtins-nonnull.c: Likewise.

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 30462ad..52d42b9 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -145,6 +145,9 @@ static rtx expand_builtin_unop (machine_mode, tree, rtx, rtx, optab);
 static rtx expand_builtin_frame_address (tree, tree);
 static tree stabilize_va_list_loc (location_t, tree, int);
 static rtx expand_builtin_expect (tree, rtx);
+static tree fold_builtin_bcmp (location_t, tree, tree, tree);
+static tree fold_builtin_bcopy (location_t, tree, tree, tree);
+static tree fold_builtin_bzero (location_t, tree, tree);
 static tree fold_builtin_constant_p (tree);
 static tree fold_builtin_classify_type (tree);
 static tree fold_builtin_strlen (location_t, tree, tree);
@@ -7982,6 +7985,56 @@ fold_builtin_sincos (location_t loc,
 			 fold_build1_loc (loc, REALPART_EXPR, type, call)));
 }
 
+/* Fold function call to built-in bzero with arguments SRC and LEN
+   into a call to built-in memset (DST, 0, LEN).  */
+
+static tree
+fold_builtin_bzero (location_t loc, tree dst, tree len)
+{
+  if (!validate_arg (dst, POINTER_TYPE)
+  || !validate_arg (len, INTEGER_TYPE))
+return NULL_TREE;
+
+  tree fn = builtin_decl_implicit (BUILT_IN_MEMSET);
+  /* Call memset and return the result cast to void to detect its use
+ (bzero returns void).  */
+  tree call = build_call_expr_loc (loc, fn, 3, dst, integer_zero_node, len);
+  return fold_convert (void_type_node, call);
+}
+
+/* Fold function call to built-in bcmp with arguments ARG1, ARG2, and LEN
+   into a call to built-in memcmp(ARG1, ARG2, LEN).  */
+
+static tree
+fold_builtin_bcmp (location_t loc, tree arg1, tree arg2, tree len)
+{
+  if (!validate_arg (arg1, POINTER_TYPE)
+  || !validate_arg (arg2, POINTER_TYPE)
+  || !validate_arg (len, INTEGER_TYPE))
+return NULL_TREE;
+
+  tree fn = builtin_decl_implicit (BUILT_IN_MEMCMP);
+  return build_call_expr_loc (loc, fn, 3, arg1, arg2, len);
+}
+
+/* Fold function call to built-in bcopy with arguments SRC, DST, and LEN
+   into a call to built-in memmove(DST, SRC, LEN).  */
+
+static tree
+fold_builtin_bcopy (location_t loc, tree src, tree dst, tree len)
+{
+  if (!validate_arg (src, POINTER_TYPE)
+  || !validate_arg (dst, POINTER_TYPE)
+  || !validate_arg (len, INTEGER_TYPE))
+return NULL_TREE;
+
+  /* bcopy has been removed from POSIX in Issue 7 but Issue 6 specifies
+ it's equivalent to memmove (not memcpy).  Call memmove and return
+ the result cast to void to detect its use (bcopy returns void).  */
+  tree fn = builtin_decl_implicit (BUILT_IN_MEMMOVE);
+  return build_call_expr_loc (loc, fn, 3, 

Re: [PATCH] warn on mem calls modifying objects of non-trivial types (PR 80560)

2017-06-07 Thread Jason Merrill

On 06/06/2017 03:24 PM, Martin Sebor wrote:

+  /* Iterate over copy and move assignment overloads.  */
+
+  for (ovl_iterator oi (fns); oi; ++oi)
+{
+  tree f = *oi;
+
+  bool accessible = !access || !(TREE_PRIVATE (f) || TREE_PROTECTED (f));
+
+  /* Skip template assignment operators and deleted functions.  */
+  if (TREE_CODE (f) != FUNCTION_DECL || DECL_DELETED_FN (f))
+   continue;
+
+  if (accessible)
+   *hasassign = true;
+
+  if (!accessible || !trivial_fn_p (f))
+   all_trivial = false;
+
+  /* Break early when both properties have been determined.  */
+  if (*hasassign && !all_trivial)
+   break;
+}


This is iterating over all assignment operators, not just copy/move.  I 
think you want copy_fn_p, here and in has_trivial_copy_p.
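
Something along these lines, perhaps (a sketch only, assuming the
existing copy_fn_p/move_fn_p predicates from cp-tree.h):

  /* Sketch: restrict the loop to copy and move assignment.  */
  if (TREE_CODE (f) != FUNCTION_DECL || DECL_DELETED_FN (f))
    continue;
  if (!copy_fn_p (f) && !move_fn_p (f))
    continue;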


It seems redundant to check access here and also check it in 
is_trivially_xible, which takes access into account.  And if we're going 
to check access at all, it should consider the access of the current 
scope, not just whether the function is private or protected.


Jason


Re: [PATCH 1/2] x86,s390: add compiler memory barriers when expanding atomic_thread_fence (PR 80640)

2017-06-07 Thread Alexander Monakov
On Fri, 26 May 2017, Alexander Monakov wrote:
> > Ping.
> 
> Ping^2?

Ping^3.

> > (to be clear, patch 2/2 is my previous followup in this thread, I forgot to
> > adjust the subject line; it should have said:
> > "[PATCH 2/2] x86: add compiler memory barriers when expanding atomic_load").
> > 
> > On Wed, 10 May 2017, Alexander Monakov wrote:
> > 
> > > Hi,
> > > 
> > > When expanding __atomic_thread_fence(x) to RTL, the i386 backend doesn't 
> > > emit
> > > any instruction except for x==__ATOMIC_SEQ_CST (which emits 'mfence').  
> > > This 
> > > is incorrect: although no machine barrier is needed, the compiler still 
> > > must
> > > emit a compiler barrier into the IR to prevent propagation and code motion
> > > across the fence.  The testcase added with the patch shows how it can lead
> > > to a miscompilation.
> > > 
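
An illustration of the kind of code affected (a sketch, not the
committed pr80640-1.c testcase):

  int g (int *p)
  {
    int a = *p;
    __atomic_thread_fence (__ATOMIC_ACQUIRE);
    int b = *p;  /* without a compiler barrier in the expansion, this
                    load could be CSEd with the one above, moving the
                    access across the fence */
    return a + b;
  }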
> > > The proposed patch fixes it by handling non-seq-cst fences exactly like
> > > __atomic_signal_fence is expanded, by emitting asm 
> > > volatile("":::"memory").
> > > 
> > > The s390 backend uses a similar mem_thread_fence expansion, so the 
> > > patch
> > > fixes both backends in the same manner.
> > > 
> > > Bootstrapped and regtested on x86_64; also checked that s390-linux cc1
> > > successfully builds after the change.  OK for trunk?
> > > 
> > > (the original source code in the PR was misusing atomic fences by doing
> > > something like
> > > 
> > >   void f(int *p)
> > >   {
> > > while (*p)
> > >   __atomic_thread_fence(__ATOMIC_ACQUIRE);
> > >   }
> > > 
> > > but since *p is not atomic, a concurrent write to *p would cause a data 
> > > race and
> > > thus invoke undefined behavior; also, if *p is false prior to entering 
> > > the loop,
> > > execution does not encounter the fence; new test here has code usable 
> > > without UB)
> > > 
> > > Alexander
> > > 
> > >   * config/i386/sync.md (mem_thread_fence): Emit a compiler barrier for
> > >   non-seq-cst fences.  Adjust comment.
> > >   * config/s390/s390.md (mem_thread_fence): Likewise.
> > >   * optabs.c (expand_asm_memory_barrier): Export.
> > >   * optabs.h (expand_asm_memory_barrier): Declare.
> > > testsuite/
> > >   * gcc.target/i386/pr80640-1.c: New testcase.
> > > ---
> > >  gcc/config/i386/sync.md   |  7 ++-
> > >  gcc/config/s390/s390.md   | 11 +--
> > >  gcc/optabs.c  |  2 +-
> > >  gcc/optabs.h  |  3 +++
> > >  gcc/testsuite/gcc.target/i386/pr80640-1.c | 12 
> > >  5 files changed, 31 insertions(+), 4 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr80640-1.c
> > > 
> > > diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
> > > index 20d46fe..619d53b 100644
> > > --- a/gcc/config/i386/sync.md
> > > +++ b/gcc/config/i386/sync.md
> > > @@ -108,7 +108,7 @@ (define_expand "mem_thread_fence"
> > >enum memmodel model = memmodel_from_int (INTVAL (operands[0]));
> > >  
> > >/* Unless this is a SEQ_CST fence, the i386 memory model is strong
> > > - enough not to require barriers of any kind.  */
> > > + enough not to require a processor barrier of any kind.  */
> > >if (is_mm_seq_cst (model))
> > >  {
> > >rtx (*mfence_insn)(rtx);
> > > @@ -124,6 +124,11 @@ (define_expand "mem_thread_fence"
> > >  
> > >emit_insn (mfence_insn (mem));
> > >  }
> > > +  else if (!is_mm_relaxed (model))
> > > +{
> > > +  /* However, a compiler barrier is still required.  */
> > > +  expand_asm_memory_barrier ();
> > > +}
> > >DONE;
> > >  })
> > >  
> > > diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
> > > index c9fd19a..65e54c4 100644
> > > --- a/gcc/config/s390/s390.md
> > > +++ b/gcc/config/s390/s390.md
> > > @@ -10109,14 +10109,21 @@ (define_expand "mem_thread_fence"
> > >[(match_operand:SI 0 "const_int_operand")] ;; model
> > >""
> > >  {
> > > +  enum memmodel model = memmodel_from_int (INTVAL (operands[0]));
> > > +
> > >/* Unless this is a SEQ_CST fence, the s390 memory model is strong
> > > - enough not to require barriers of any kind.  */
> > > -  if (is_mm_seq_cst (memmodel_from_int (INTVAL (operands[0]
> > > + enough not to require a processor barrier of any kind.  */
> > > +  if (is_mm_seq_cst (model))
> > >  {
> > >rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
> > >MEM_VOLATILE_P (mem) = 1;
> > >emit_insn (gen_mem_thread_fence_1 (mem));
> > >  }
> > > +  else if (!is_mm_relaxed (model))
> > > +{
> > > +  /* However, a compiler barrier is still required.  */
> > > +  expand_asm_memory_barrier ();
> > > +}
> > >DONE;
> > >  })
> > >  
> > > diff --git a/gcc/optabs.c b/gcc/optabs.c
> > > index 48e37f8..1f1fbc3 100644
> > > --- a/gcc/optabs.c
> > > +++ b/gcc/optabs.c
> > > @@ -6269,7 +6269,7 @@ expand_atomic_compare_and_swap (rtx *ptarget_bool, 
> > > rtx *ptarget_oval,
> > >  
> > >  /* 

Re: Reorganization of profile count maintenance code, part 1

2017-06-07 Thread Segher Boessenkool
On Wed, Jun 07, 2017 at 11:11:08PM +0200, Jan Hubicka wrote:
> > On Tue, Jun 06, 2017 at 10:25:29PM +0200, Jan Hubicka wrote:
> > > > On Thu, Jun 01, 2017 at 01:35:56PM +0200, Jan Hubicka wrote:
> > > > > +  /* FIXME: shrink wrapping violates this sanity check.  */
> > > > > +  gcc_checking_assert ((num >= 0
> > > > > + && (num <= REG_BR_PROB_BASE
> > > > > + || den <= REG_BR_PROB_BASE)
> > > > > + && den > 0) || 1);
> > > > > +  ret.m_val = RDIV (m_val * num, den);
> > > > > +  return ret;
> > > > 
> > > > Sorry if I missed this...  But where/how does it violate this?
> > > 
> > > It sums multiple probabilties together and overflows the limit.
> > 
> > Ah.  Yes, the scale it uses (num/den in shrink-wrap.c:924) isn't a
> > probability: it's just a fraction, but also <= 1.  The calculation cannot
> > overflow, not while there are at most 4G incoming edges to a BB.
> 
> Problem is that when you multiply count by it, the buffer is considerably
> smaller, because count itself can be quite large number...

Oh duh, I stupidly looked at the frequency thing, not the count thing.

> It is not too bad - I added the sanity check mostly to see if there
> is a safe cap on num so I do not need to worry about overflows at all.

For count we can round a bit without causing problems, I think?  No
such luxury with frequency, but, that's not the issue here :-)


Segher


Re: Handle data dependence relations with different bases

2017-06-07 Thread Richard Sandiford
Richard Biener  writes:
>>> How does this look?  Changes since v1:
>>>
>>> - Added access_fn_component_p to check for valid access function components.
>>>
>>> - Added access_fn_components_comparable_p instead of using
>>>   types_compatible_p directly.
>>>
>>> - Added more commentary.
>>>
>>> - Added local structures to represent the sequence, so that it's
>>>   more obvious which variables are temporaries and which aren't.
>>>
>>> - Added the test above to vect-alias-check-3.c.
>>>
>>> Tested on aarch64-linux-gnu and x86_64-linux-gnu.
>
> This is ok.

Thanks.  Just been retesting, and I think I must have forgotten
to include Ada last time.  It turns out that the patch causes a dg-scan
regression in gnat.dg/vect17.adb, because we now think that if the
array RECORD_TYPEs *do* alias in:

   procedure Add (X, Y : aliased Sarray; R : aliased out Sarray) is
   begin
      for I in Sarray'Range loop
         R(I) := X(I) + Y(I);
      end loop;
   end;

then the dependence distance must be zero.  Eric, does that hold true
for Ada?  I.e. if X and R (or Y and R) alias, must it be the case that
X(I) can only alias R(I) and not for example R(I-1) or R(I+1)?  Or are
the arrays allowed to overlap by an arbitrary number of indices?

If the assumption is correct, is the patch below OK?

Thanks,
Richard


2017-06-07  Richard Sandiford  

gcc/testsuite/
* gnat.dg/vect17.ads (Sarray): Increase range to 1 .. 5.
* gnat.dg/vect17.adb (Add): Create a dependence distance of 1
when X = R or Y = R.

Index: gcc/testsuite/gnat.dg/vect17.ads
===
--- gcc/testsuite/gnat.dg/vect17.ads	2015-10-14 14:58:56.0 +0100
+++ gcc/testsuite/gnat.dg/vect17.ads	2017-06-07 22:10:24.796368118 +0100
@@ -1,6 +1,6 @@
 package Vect17 is
 
-   type Sarray is array (1 .. 4) of Long_Float;
+   type Sarray is array (1 .. 5) of Long_Float;
for Sarray'Alignment use 16;
 
procedure Add (X, Y : aliased Sarray; R : aliased out Sarray);
Index: gcc/testsuite/gnat.dg/vect17.adb
===
--- gcc/testsuite/gnat.dg/vect17.adb	2015-10-14 14:58:56.0 +0100
+++ gcc/testsuite/gnat.dg/vect17.adb	2017-06-07 22:10:24.796368118 +0100
@@ -5,8 +5,9 @@ package body Vect17 is
 
procedure Add (X, Y : aliased Sarray; R : aliased out Sarray) is
begin
-  for I in Sarray'Range loop
- R(I) := X(I) + Y(I);
+  R(1) := X(5) + Y(5);
+  for I in 1 .. 4 loop
+ R(I + 1) := X(I) + Y(I);
   end loop;
end;
 


Re: [PATCH, rs6000] PR 80982 gcc.target/powerpc/builtins-3-runnable.c fails starting with its introduction in r248846

2017-06-07 Thread Segher Boessenkool
On Wed, Jun 07, 2017 at 12:16:09PM -0700, Carl E. Love wrote:
> Bug 80982 - gcc.target/powerpc/builtins-3-runnable.c fails starting with
> its introduction in r248846. 
> 
> The failure was due to GEN_INT (8) being out of bounds.  Once that was
> fixed, the order of the arguments also needed fixing to get the correct
> result.  These changes only apply to the BE code.
> 
> The following patch has been tested on powerpc64le-unknown-linux-gnu
> (Power 8 LE) and on powerpc64-unknown-linux-gnu (Power 8 BE) with no
> regressions.
> 
> Is the patch OK for gcc mainline?

Yes please.  Modulo a changelog thing:

> 2017-06-07  Carl Love  
> 
>   PR target/80982
>   * config/rs6000/altivec.md: Fix the implementation of
>   (define_expand "doublel2" for BE.

* config/rs6000/altivec.md (double2): Fix the implementation
for BE.

Thanks,


Segher


Re: [PATCH, rs6000] Fix vec_mulo and vec_mule builtin implementations

2017-06-07 Thread Segher Boessenkool
On Tue, Jun 06, 2017 at 03:55:28PM -0700, Carl E. Love wrote:
> The support for the vec_mulo and vec_mule that was recently submitted
> has a couple of bugs.  Specifically, they were implemented with
> int/unsigned int args and return int/unsigned int.  The return types
> should have been long long/unsigned long long.  Additionally it was
> noted that the unsigned version returned a signed result by mistake.  
> 
> The following patch fixes these issues.  The patch has been tested on
> powerpc64le-unknown-linux-gnu (Power 8 LE) and on
> powerpc64-unknown-linux-gnu (Power 8 BE) with no regressions.
> 
> Is the patch OK for gcc mainline?

Looks good.  Thanks!


Segher


> 2017-06-08  Carl Love  
> 
>   * config/rs6000/rs6000-c: The return type of the following
>   built-in functions was implemented as int not long long.  Fix sign
>   of return value for the unsigned version of vec_mulo and vec_mule.
>   vector unsigned long long vec_bperm (vector unsigned long long,
>vector unsigned char)
>   vector signed long long vec_mule (vector signed int,
> vector signed int)
>   vector unsigned long long vec_mule (vector unsigned int,
>   vector unsigned int)
>   vector signed long long vec_mulo (vector signed int,
> vector signed int)
>   vector unsigned long long vec_mulo (vector unsigned int,
>   vector unsigned int)
>   * doc/extend.texi: Fix the documentation for the
>   built-in functions.
> 
> gcc/testsuite/ChangeLog:
> 
> 2017-06-08  Carl Love  
> 
>   * gcc.target/powerpc/builtins-3.c: Fix vec_mule, vec_mulo test cases.


Re: Reorganization of profile count maintenance code, part 1

2017-06-07 Thread Jan Hubicka
> On Tue, Jun 06, 2017 at 10:25:29PM +0200, Jan Hubicka wrote:
> > > On Thu, Jun 01, 2017 at 01:35:56PM +0200, Jan Hubicka wrote:
> > > > +  /* FIXME: shrink wrapping violates this sanity check.  */
> > > > +  gcc_checking_assert ((num >= 0
> > > > +   && (num <= REG_BR_PROB_BASE
> > > > +   || den <= REG_BR_PROB_BASE)
> > > > +   && den > 0) || 1);
> > > > +  ret.m_val = RDIV (m_val * num, den);
> > > > +  return ret;
> > > 
> > > Sorry if I missed this...  But where/how does it violate this?
> > 
> > It sums multiple probabilties together and overflows the limit.
> 
> Ah.  Yes, the scale it uses (num/den in shrink-wrap.c:924) isn't a
> probability: it's just a fraction, but also <= 1.  The calculation cannot
> overflow, not while there are at most 4G incoming edges to a BB.

Problem is that when you multiply count by it, the buffer is considerably
smaller, because count itself can be quite large number...
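
A back-of-the-envelope illustration (assuming a 64-bit counter; the
numbers are only for scale):

  #include <stdint.h>

  /* A profile count can use most of the 64-bit range ...  */
  uint64_t count = (uint64_t) 1 << 61;
  /* ... so scaling even by num == REG_BR_PROB_BASE (10000) needs about
     74 bits: the multiply in RDIV (count * num, den) wraps before the
     division can bring the value back down.  */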

It is not too bad - I added the sanity check mostly to see if there
is a safe cap on num so I do not need to worry about overflows at all.

Honza
> 
> 
> Segher


Re: Containers default initialization

2017-06-07 Thread François Dumont

On 05/06/2017 13:31, Jonathan Wakely wrote:

On 04/06/17 22:26 +0200, François Dumont wrote:

Hi

   I have eventually adapted the test to all containers and the result 
is successful for map/set/unordered_map/unordered_set. It is failing 
for deque/list/forward_list/vector/vector<bool>.


   I even tried to change the test to look at the difference between an 
explicit call to the default constructor done through the placement 
new call and an implicit call done on a normal declaration. I wondered 
if we would have the same kind of difference as between an int i; 
and an int i();. I tried to set the stack to ~0 before declaring the 
instance. I know there is no guarantee on the content of the stack 
for the following declaration, but do you think it is reliable enough 
to commit it?


   OK to commit the successful tests?



No, I'm seeing failures for some of these if I add
// { dg-options "-O0" }

   Frankly I don't understand the results of those tests. I would 
have expected map/set to fail and the others to succeed. We might need 
help from the compiler guys, no?


I think your tests are just insufficient. With optimisation enabled
(the testsuite uses -O2 by default) the compiler can remove the memset
just before the __aligned_buffer goes out of scope, because it is
unobservable in a correct program. This is similar to the situation
described at https://gcc.gnu.org/gcc-6/porting_to.html#flifetime-dse

If I change the placement new-expressions to use default-init instead
of value-init and use -O0 then I see all four tests FAIL here:

test_type *tmp = ::new(buf._M_addr()) test_type; // not test_type()


Ok, I didn't know we could do this.
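
A minimal standalone illustration of the two forms (plain C++, no
library machinery assumed):

  #include <new>

  struct A { int i; };

  void demo ()
  {
    alignas (A) unsigned char buf[sizeof (A)];
    A *p1 = ::new (buf) A;     // default-init: p1->i is indeterminate
    p1->~A ();
    A *p2 = ::new (buf) A ();  // value-init: p2->i is zero-initialized
    p2->~A ();
  }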

So I have added value_init.cc tests showing the problem for all containers.

The patch also contains the fix for rb_tree so that map/set are now 
successful.


It looks like there is really a misunderstanding about what the compiler 
is doing. If the container calls _Node_allocator(), the compiler will 
transform it into default-initialization if the container's default 
constructor is being called, or into value-initialization if the 
container is value-initialized, which takes place only if I call this:


test_type *tmp = ::new(buf._M_addr()) test_type {};

To force default/value-init, it looks like GCC forces you to explicitly 
build an allocator instance, as in the attached patch.


François

Index: include/bits/stl_tree.h
===
--- include/bits/stl_tree.h	(revision 248855)
+++ include/bits/stl_tree.h	(working copy)
@@ -687,9 +687,17 @@
 
 #if __cplusplus < 201103L
 	  _Rb_tree_impl()
+	  : _Node_allocator()
 	  { }
 #else
-	  _Rb_tree_impl() = default;
+	  _Rb_tree_impl()
+	noexcept(
+		noexcept(_Node_allocator()) && noexcept(_Base_key_compare()) )
+	  : _Rb_tree_impl(_Key_compare(), _Node_allocator())
+	  { }
+#endif
+
+#if __cplusplus >= 201103L
 	  _Rb_tree_impl(_Rb_tree_impl&&) = default;
 #endif
 
Index: testsuite/23_containers/map/allocator/default_init.cc
===
--- testsuite/23_containers/map/allocator/default_init.cc	(nonexistent)
+++ testsuite/23_containers/map/allocator/default_init.cc	(working copy)
@@ -0,0 +1,52 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+// { dg-options "-O0" }
+
+#include <map>
+#include <ext/aligned_buffer.h>
+#include <testsuite_hooks.h>
+
+#include <testsuite_allocator.h>
+
+using T = int;
+
+using __gnu_test::default_init_allocator;
+
+void test01()
+{
+  typedef default_init_allocator<T> alloc_type;
+  typedef std::map<T, T, std::less<T>, alloc_type> test_type;
+
+  __gnu_cxx::__aligned_buffer<test_type> buf;
+  __builtin_memset(buf._M_addr(), ~0, sizeof(test_type));
+
+  VERIFY( buf._M_ptr()->get_allocator().state != 0 );
+  
+  test_type *tmp = ::new(buf._M_addr()) test_type();
+
+  VERIFY( tmp->get_allocator().state == 0 );
+
+  tmp->~test_type();
+}
+
+int main()
+{
+  test01();
+  return 0;
+}
Index: testsuite/23_containers/map/allocator/value_init.cc
===
--- testsuite/23_containers/map/allocator/value_init.cc	(nonexistent)
+++ testsuite/23_containers/map/allocator/value_init.cc	(working copy)
@@ -0,0 +1,52 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ 

Re: [PATCH] handle bzero/bcopy in DSE and aliasing (PR 80933, 80934)

2017-06-07 Thread Joseph Myers
On Wed, 7 Jun 2017, Bernhard Reutner-Fischer wrote:

> It's like optimizing foo() to a random built-in but maybe that's just 
> me. If your libc provides a define to a standard function for these 
> under a compat knob then fine but otherwise you should fix that. 
> *shrug*. Joseph?

It's the usual thing of: if the -std option enables a function as built-in 
it can be assumed to have certain semantics if called (this does *not* 
mean it's appropriate to generate calls to it from code that called other 
functions, as the library might not have it at all; that's for the 
libc_has_function hook to specify).  Otherwise (-std=c11 etc. options) 
semantics cannot be assumed based on the name (they still can for 
__builtin_*) - but in principle it may be OK to assume semantics for a 
function whose name is reserved in the relevant C standard version, or if 
an explicit declaration came from a system header (which would generally 
be the case for e.g. -std=c99 -D_POSIX_C_SOURCE=whatever).

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH] handle bzero/bcopy in DSE and aliasing (PR 80933, 80934)

2017-06-07 Thread Martin Sebor

On 06/07/2017 02:01 PM, Marc Glisse wrote:

On Wed, 7 Jun 2017, Bernhard Reutner-Fischer wrote:


On 7 June 2017 16:46:53 CEST, Martin Sebor  wrote:

On 06/07/2017 02:23 AM, Richard Biener wrote:

On Wed, Jun 7, 2017 at 5:26 AM, Martin Sebor 

wrote:

Note I'd be _much_ more sympathetic to simply canonicalizing all of
bzero and bcopy
to memset / memmove and be done with all the above complexity.



Attached is an updated patch along these lines.  Please let me
know if it matches your expectations.


I think you attached the wrong patch.


Yes I did, sorry.  The correct one is attached.


Under POSIX.1-2008 "optimizing" bzero or bcmp is IMO plain wrong.

It's like optimizing foo() to a random built-in but maybe that's just
me. If your libc provides a define to a standard function for these
under a compat knob then fine but otherwise you should fix that.
*shrug*. Joseph?


The patch optimizes __builtin_bzero, which should be ok. The question
(independent from this patch) is then under what conditions bzero should
be detected as a builtin.


Yes.  The problem is that unlike for C and C++, GCC doesn't have
a mechanism to select the target version of POSIX.  I think it
should.

But there is a subtle problem with the patch that needs fixing.
Bcopy should not be transformed to memcpy but rather memmove.
I'll fix that before committing.

Martin


Re: [PATCH] handle bzero/bcopy in DSE and aliasing (PR 80933, 80934)

2017-06-07 Thread Marc Glisse

On Wed, 7 Jun 2017, Bernhard Reutner-Fischer wrote:


On 7 June 2017 16:46:53 CEST, Martin Sebor  wrote:

On 06/07/2017 02:23 AM, Richard Biener wrote:

On Wed, Jun 7, 2017 at 5:26 AM, Martin Sebor 

wrote:

Note I'd be _much_ more sympathetic to simply canonicalizing all of
bzero and bcopy
to memset / memmove and be done with all the above complexity.



Attached is an updated patch along these lines.  Please let me
know if it matches your expectations.


I think you attached the wrong patch.


Yes I did, sorry.  The correct one is attached.


Under POSIX.1-2008 "optimizing" bzero or bcmp is IMO plain wrong.

It's like optimizing foo() to a random built-in but maybe that's just 
me. If your libc provides a define to a standard function for these 
under a compat knob then fine but otherwise you should fix that. 
*shrug*. Joseph?


The patch optimizes __builtin_bzero, which should be ok. The question 
(independent from this patch) is then under what conditions bzero should 
be detected as a builtin.


--
Marc Glisse


[PATCH, rs6000] PR 80982 gcc.target/powerpc/builtins-3-runnable.c fails starting with its introduction in r248846

2017-06-07 Thread Carl E. Love
GCC Maintainers:

The support for the vec_doublel () builtin didn't get re-tested on BE before
submission.  The following bugzilla was filed against the builtin
support.


Bug 80982 - gcc.target/powerpc/builtins-3-runnable.c fails starting with
its introduction in r248846. 

The failure was due to GEN_INT (8) being out of bounds.  Once that was
fixed, the order of the arguments also needed fixing to get the correct
result.  These changes only apply to the BE code.

The following patch has been tested on powerpc64le-unknown-linux-gnu
(Power 8 LE) and on powerpc64-unknown-linux-gnu (Power 8 BE) with no
regressions.

Is the patch OK for gcc mainline?

  Carl Love

-
PR 80982 gcc.target/powerpc/builtins-3-runnable.c fails starting with its introduction in r248846.

gcc/ChangeLog:

2017-06-07  Carl Love  

PR target/80982
* config/rs6000/altivec.md: Fix the implementation of
(define_expand "doublel2" for BE.
---
 gcc/config/rs6000/altivec.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 9e592bc..487b9a4 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -3002,9 +3002,9 @@
   emit_insn (gen_vsx_xxsldwi_ (rtx_tmp, operands[1],
 operands[1], rtx_val));
 
-  rtx_val = GEN_INT (8);
-  emit_insn (gen_vsx_xxsldwi_ (rtx_tmp, rtx_tmp,
-operands[1],rtx_val));
+  rtx_val = GEN_INT (2);
+  emit_insn (gen_vsx_xxsldwi_ (rtx_tmp, operands[1],
+rtx_tmp, rtx_val));
   emit_insn (gen_vsx_xvcvdp (operands[0], rtx_tmp));
 }
   else
-- 
1.9.1





Re: C/C++ PATCH to implement -Wmultiline-expansion (PR c/80116)

2017-06-07 Thread Pedro Alves
Hi Marek,

Nice warning!  Just to confirm, would the patch warn with code like:

 const char *
 target_xfer_status_to_string (enum target_xfer_status status)
 {
#define CASE(X) case X: return #X
   switch (status)
 {
   CASE(TARGET_XFER_E_IO);
   CASE(TARGET_XFER_UNAVAILABLE);
 default:
   return "";
 }
#undef CASE
 };

?

I think it shouldn't, but I couldn't tell from the tests,
and the only similar instance I found in gcc is guarded
behind an #ifdef (in eh_data_format_name).

Thanks,
Pedro Alves



Re: Reorganization of profile count maintenance code, part 1

2017-06-07 Thread Segher Boessenkool
On Tue, Jun 06, 2017 at 10:25:29PM +0200, Jan Hubicka wrote:
> > On Thu, Jun 01, 2017 at 01:35:56PM +0200, Jan Hubicka wrote:
> > > +  /* FIXME: shrink wrapping violates this sanity check.  */
> > > +  gcc_checking_assert ((num >= 0
> > > + && (num <= REG_BR_PROB_BASE
> > > + || den <= REG_BR_PROB_BASE)
> > > + && den > 0) || 1);
> > > +  ret.m_val = RDIV (m_val * num, den);
> > > +  return ret;
> > 
> > Sorry if I missed this...  But where/how does it violate this?
> 
> It sums multiple probabilties together and overflows the limit.

Ah.  Yes, the scale it uses (num/den in shrink-wrap.c:924) isn't a
probability: it's just a fraction, but also <= 1.  The calculation cannot
overflow, not while there are at most 4G incoming edges to a BB.


Segher


Re: C/C++ PATCH to implement -Wmultiline-expansion (PR c/80116)

2017-06-07 Thread Marek Polacek
On Fri, Jun 02, 2017 at 03:50:12PM -0600, Martin Sebor wrote:
> On 06/02/2017 10:52 AM, Marek Polacek wrote:
> > On Thu, Jun 01, 2017 at 04:17:24PM -0600, Martin Sebor wrote:
> > > Very nice.  I think David already suggested handling other statements
> > > besides if (do/while), so let me just add for and switch (as in:
> > > 'switch (1) case SWAP (i, j);')
> > 
> > How's that one problematic, though?
> 
> The same way as 'if (1) SWAP (i, j);' because it expands to
> 
>   switch (1) case 1: tmp = i;
>   i = j;
>   j = tmp;
> 
> (I had a typo there so maybe that obscured the problem.)

Ah, ok, I can see now.  Actually the C++ FE would already warn with my patch,
but the C FE needed small changes to warn.
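
(For reference, the conventional way to make such a macro immune to this
problem is the do/while idiom -- illustrative:)

  #define SWAP(x, y) do { tmp = (x); (x) = (y); (y) = tmp; } while (0)
  /* Now `if (1) SWAP (i, j);` expands to a single statement, so it
     nests correctly under if/else, while and switch.  */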

> > 
> > > The location in the warning look like it could be improved to extend
> > > from just the first column to the whole macro argument but I don't
> > > suppose that's under the direct control of your patch.
> > 
> > Well, for e.g. the "in expansion" message we have a good location range:
> >   SWAP
> >   ^~~~
> > so do you mean this?
> > tmp = x;
> > ^
> 
> Yes.
> 
> > ?  But yea, it's outside the scope of this patch.
> > 
> > > Besides the statements already mentioned above, here are a couple
> > > of corner cases I noticed are not handled while playing with the
> > > patch:
> > > 
> > >   #define M(x) x
> > > 
> > >   int f (int i)
> > >   {
> > > if (i)
> > >   M (--i; --i);   // can this be handled?
> > > 
> > > return i;
> > >   }
> > > 
> > > and
> > > 
> > >   #define M(x) x; x
> > > 
> > >   int f (int i)
> > >   {
> > > if (i)
> > >   M (--i; --i);   // seems like this should be handled
> > > 
> > > return i;
> > >   }
> > 
> > Hmm, I was hoping my patch would warn for both examples, but it doesn't.
> > I'll have to get back to this and ponder more.

I still haven't looked into this.  Might not be easy so I think let's not
block the patch on this.

> > > As an aside since it's outside the subset of the bigger problem
> > > you chose to solve, there is a related issue with macros that
> > > expand to an unparenthesized binary (and even some unary)
> > > expression:
> > > 
> > >   #define sum(x, y) x + y
> > > 
> > >   int n = 2 * sum (3, 5);
> > > 
> > > I'm not very familiar with this area of the parser but I would
> > > expect it to be relatively straightforward to extend your solution
> > > to handle this problem as well.
> > 
> > It'd certainly be useful to warn here.  But it doesn't seem to be an
> > easy warning to implement, to me.  E.g. warning for
> > 
> >   int n = 2 + sum (3, 5);
> > 
> > would be annoying, I suspect.
> 
> Yes, it would be if the warning was only meant to trigger for
> uses that change the meaning.  If it was meant to warn on unsafe
> macro definitions instead it should be less so.  But that would
> make it a different warning, and mean implementing it in the
> preprocessor.  Hmm, I guess it's not as straightforward as it
> seemed.

Nothing ever is ;).

All right, here's the third version of this patch which also warns about
the switch case Martin pointed out above.

Bootstrapped/regtested on x86_64-linux, ok for trunk?

2017-06-07  Marek Polacek  

PR c/80116
* c-common.h (warn_for_multiline_expansion): Declare.
* c-warn.c (warn_for_multiline_expansion): New function.
* c.opt (Wmultiline-expansion): New option.

* c-parser.c (c_parser_if_body): Set the location of the
body of the conditional after parsing all the labels.  Call
warn_for_multiline_expansion.
(c_parser_else_body): Likewise.
(c_parser_switch_statement): Likewise.
(c_parser_while_statement): Likewise.
(c_parser_for_statement): Likewise.
(c_parser_statement): Add a default argument.  Save the location
after labels have been parsed.
(c_parser_c99_block_statement): Likewise.

* parser.c (cp_parser_statement): Add a default argument.  Save the
location of the expression-statement after labels have been parsed.
(cp_parser_implicitly_scoped_statement): Set the location of the
body of the conditional after parsing all the labels.  Call
warn_for_multiline_expansion.
(cp_parser_already_scoped_statement): Likewise.

* doc/invoke.texi: Document -Wmultiline-expansion.

* c-c++-common/Wmultiline-expansion-1.c: New test.
* c-c++-common/Wmultiline-expansion-2.c: New test.
* c-c++-common/Wmultiline-expansion-3.c: New test.
* c-c++-common/Wmultiline-expansion-4.c: New test.
* c-c++-common/Wmultiline-expansion-5.c: New test.
* c-c++-common/Wmultiline-expansion-6.c: New test.
* c-c++-common/Wmultiline-expansion-7.c: New test.
* c-c++-common/Wmultiline-expansion-8.c: New test.
* c-c++-common/Wmultiline-expansion-9.c: New test.
* c-c++-common/Wmultiline-expansion-10.c: New test.

diff --git gcc/c-family/c-common.h 

Re: [PATCH] multiarch support for non-glibc linux systems

2017-06-07 Thread Bernhard Reutner-Fischer
On 7 June 2017 19:22:43 CEST, Szabolcs Nagy  wrote:
>The current multiarch directory name is always *-linux-gnu* on linux;
>this patch configures different names for uclibc and musl targets.
>(Tested by the debian rebootstrap scripts for various *-linux-musl
>and *-linux-uclibc targets; see debian bug #861588.)

FWIW my version has used $libc since ages, but in effect it is identical, yes.

Cheers,
>
>gcc/
>2017-06-07  Szabolcs Nagy  
>
>   * config.gcc (*-linux-musl*): Add t-musl tmake_file.
>   (*-linux-uclibc*): Add t-uclibc tmake_file.
>   * config/t-musl: New.
>   * config/t-uclibc: New.



Re: [PATCH] handle bzero/bcopy in DSE and aliasing (PR 80933, 80934)

2017-06-07 Thread Bernhard Reutner-Fischer
On 7 June 2017 16:46:53 CEST, Martin Sebor  wrote:
>On 06/07/2017 02:23 AM, Richard Biener wrote:
>> On Wed, Jun 7, 2017 at 5:26 AM, Martin Sebor 
>wrote:
 Note I'd be _much_ more sympathetic to simply canonicalizing all of
 bzero and bcopy
 to memset / memmove and be done with all the above complexity.
>>>
>>>
>>> Attached is an updated patch along these lines.  Please let me
>>> know if it matches your expectations.
>>
>> I think you attached the wrong patch.
>
>Yes I did, sorry.  The correct one is attached.

Under POSIX.1-2008 "optimizing" bzero or bcmp is IMO plain wrong.

It's like optimizing foo() to a random built-in but maybe that's just me. If 
your libc provides a define to a standard function for these under a compat 
knob then fine but otherwise you should fix that. *shrug*. Joseph?

thanks,
>
>Martin
>
>>
>> Richard.
>>
>>> FWIW, although I don't feel too strongly about bzero et al. I'm
>>> not sure that this approach is the right one in general.  It might
>>> (slightly) simplify GCC itself, but other than the incidental code
>>> generation improvement, it offers no benefit to users.  In some
>>> cases, it even degrades user experience by causing GCC issue
>>> diagnostics that refer to functions that don't appear in the source
>>> code, such as for:
>>>
>>>   char d[1];
>>>
>>>   void* f (const void *p)
>>>   {
>>> bzero (d, 7);
>>>   }
>>>
>>>   warning: ‘__builtin_memset’ writing 7 bytes into a region of size
>1
>>> overflows the destination [-Wstringop-overflow=]
>>>
>>> For some functions like mempcpy it might even worse code overall
>>> (slower and bigger).
>>>
>>> In other cases (like profiling) it loses interesting information.
>>>
>>> I think these types of transformations would be justified if they
>>> were done based on measurably improved efficiency of the generated
>>> code, but I'm uneasy about swapping calls to one function for
>another
>>> solely because it simplifies the implementation.  Not least because
>>> it doesn't seem like a viable general approach to simplifying the
>>> implementation.
>>>
>>> Martin
>>>
>>> PS I stopped short of simplifying GCC to remove the existing special
>>> handling of these three built-ins.  If the patch is approved I'm
>>> willing to do the cleanup in a subsequent pass.



Re: [PATCH] libgo: fix support for ia64

2017-06-07 Thread Ian Lance Taylor
On Tue, Jun 6, 2017 at 6:11 AM, Andreas Schwab  wrote:
> This adds support for ia64 in lfstack.

Thanks.

Committed to mainline.

Ian


> diff --git a/libgo/go/runtime/lfstack_64bit.go 
> b/libgo/go/runtime/lfstack_64bit.go
> index b314a3ba21..99dcec02de 100644
> --- a/libgo/go/runtime/lfstack_64bit.go
> +++ b/libgo/go/runtime/lfstack_64bit.go
> @@ -2,7 +2,7 @@
>  // Use of this source code is governed by a BSD-style
>  // license that can be found in the LICENSE file.
>
> -// +build amd64 arm64 mips64 mips64le ppc64 ppc64le s390x arm64be alpha 
> mipsn64 sparc64
> +// +build amd64 arm64 mips64 mips64le ppc64 ppc64le s390x arm64be alpha 
> mipsn64 sparc64 ia64
>
>  package runtime
>
> @@ -38,12 +38,22 @@ const (
> // room in the bottom for the count.
> sparcLinuxAddrBits = 52
> sparcLinuxCntBits  = 64 - sparcLinuxAddrBits + 3
> +
> +   // On IA64, the virtual address space is divided into 8 regions, with
> +   // 52 address bits each (with 64k page size).
> +   ia64AddrBits = 55
> +   ia64CntBits = 64 - ia64AddrBits + 3
>  )
>
>  func lfstackPack(node *lfnode, cnt uintptr) uint64 {
> 	if GOARCH == "sparc64" && GOOS == "linux" {
> 		return uint64(uintptr(unsafe.Pointer(node)))<<(64-sparcLinuxAddrBits) | uint64(cnt&(1<<sparcLinuxCntBits-1))
> 	}
> +	if GOARCH == "ia64" {
> +		// Top three bits are the region number
> +		val := uint64(uintptr(unsafe.Pointer(node)))
> +		return (val<<(64-ia64AddrBits))&(1<<(64-3)-1) | val&^(1<<(64-3)-1) | uint64(cnt&(1<<ia64CntBits-1))
> +	}
> 	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
>  }
>
> @@ -56,5 +66,8 @@ func lfstackUnpack(val uint64) *lfnode {
> 	if GOARCH == "sparc64" && GOOS == "linux" {
> 		return (*lfnode)(unsafe.Pointer(uintptr(int64(val) >> sparcLinuxCntBits << 3)))
> 	}
> +	if GOARCH == "ia64" {
> +		return (*lfnode)(unsafe.Pointer(uintptr((val >> ia64CntBits << 3)&(1<<(64-3)-1) | val&^(1<<(64-3)-1))))
> +	}
> 	return (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
>  }
> --
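
The packing trick itself, in a standalone sketch (C++ for illustration;
the constants are placeholders, not the runtime's values):

  #include <stdint.h>

  const unsigned addrBits = 48;             // significant address bits
  const unsigned cntBits = 64 - addrBits + 3;

  // Pack an 8-byte-aligned pointer and a wrap-around counter into 64 bits.
  uint64_t pack (void *node, uint64_t cnt)
  {
    return (uint64_t) (uintptr_t) node << (64 - addrBits)
           | (cnt & (((uint64_t) 1 << cntBits) - 1));
  }

  void *unpack (uint64_t val)
  {
    // Shift the counter away, then restore the 3 alignment bits.
    return (void *) (uintptr_t) (val >> cntBits << 3);
  }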


[PATCH] multiarch support for non-glibc linux systems

2017-06-07 Thread Szabolcs Nagy
The current multiarch directory name is always *-linux-gnu* on linux;
this patch configures different names for uclibc and musl targets.
(Tested by the debian rebootstrap scripts for various *-linux-musl
and *-linux-uclibc targets; see debian bug #861588.)

gcc/
2017-06-07  Szabolcs Nagy  

* config.gcc (*-linux-musl*): Add t-musl tmake_file.
(*-linux-uclibc*): Add t-uclibc tmake_file.
* config/t-musl: New.
* config/t-uclibc: New.
diff --git a/gcc/config.gcc b/gcc/config.gcc
index a311cd95f49..fb7b7cd6d4c 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -3096,6 +3096,16 @@ powerpc*-*-* | rs6000-*-*)
 	tm_file="${tm_file} ${cpu_type}/option-defaults.h"
 esac
 
+# non-glibc systems
+case ${target} in
+*-linux-musl*)
+	tmake_file="${tmake_file} t-musl"
+	;;
+*-linux-uclibc*)
+	tmake_file="${tmake_file} t-uclibc"
+	;;
+esac
+
 # Build mkoffload tool
 case ${target} in
 *-intelmic-* | *-intelmicemul-*)
diff --git a/gcc/config/t-musl b/gcc/config/t-musl
new file mode 100644
index 000..e203fce7619
--- /dev/null
+++ b/gcc/config/t-musl
@@ -0,0 +1,2 @@
+MULTIARCH_DIRNAME := $(subst -linux-gnu,-linux-musl,$(MULTIARCH_DIRNAME))
+MULTILIB_OSDIRNAMES := $(subst -linux-gnu,-linux-musl,$(MULTILIB_OSDIRNAMES))
diff --git a/gcc/config/t-uclibc b/gcc/config/t-uclibc
new file mode 100644
index 000..c9b3a7bdfe2
--- /dev/null
+++ b/gcc/config/t-uclibc
@@ -0,0 +1,2 @@
+MULTIARCH_DIRNAME := $(subst -linux-gnu,-linux-uclibc,$(MULTIARCH_DIRNAME))
+MULTILIB_OSDIRNAMES := $(subst -linux-gnu,-linux-uclibc,$(MULTILIB_OSDIRNAMES))


Re: [PATCH/AARCH64] Improve/correct ThunderX 1 cost model for Arith_shift

2017-06-07 Thread James Greenhalgh
On Fri, Dec 30, 2016 at 10:05:26PM -0800, Andrew Pinski wrote:
> Hi,
>   Currently for the following function:
> int f(int a, int b)
> {
>   return a + (b <<7);
> }
> 
> GCC produces:
> add w0, w0, w1, lsl 7
> But for ThunderX 1, it is better if the instruction is split, allowing
> better scheduling to happen in most cases; the latency is the same.  I
> get a small improvement in coremarks, ~1%.
> 
> Currently the code does not take into account Arith_shift even though
> the comment:
>   /* Strip any extend, leave shifts behind as we will
> cost them through mult_cost.  */
> says it does not strip out the shift; aarch64_strip_extend does, and
> always has since the back-end was added to GCC.
> 
> Once I fixed the code around aarch64_strip_extend, I got a regression
> for ThunderX 1 as some shifts/extends (left shifts <=4 and/or zero
> extends) are considered free so I needed to add a new tuning flag.
> 
> Note I will get an even more improvement for ThunderX 2 CN99XX, but I
> have not measured it yet as I have not made the change to
> aarch64-cost-tables.h yet as I am waiting for approval of the renaming
> patch first before submitting any of the cost table changes.  Also I
> noticed this problem with this tuning first and then looked back at
> what I needed to do for ThunderX 1.
> 
> OK?  Bootstrapped and tested on aarch64-linux-gnu without any
> regressions (both with and without --with-cpu=thunderx).

This is mostly OK, but I don't like the name "easy"_shift_extend. Cheap
or free seems better. I have some other minor points below.

> Index: config/aarch64/aarch64-tuning-flags.def
> ===
> --- config/aarch64/aarch64-tuning-flags.def   (revision 243974)
> +++ config/aarch64/aarch64-tuning-flags.def   (working copy)
> @@ -35,4 +35,8 @@ two load/stores are not at least 8 byte
>  pairs.   */
>  AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
>  
> +/* Logical shift left <=4 with/without zero extend are considered easy
> +   extended, also zero extends without the shift. */


I'm struggling to parse this comment. "also zero extends without the shift"
is what is getting me. I'm also not certain I follow when I should set this
flag. If all shifts are cheap/free on my platform, should I set this flag?

> +AARCH64_EXTRA_TUNING_OPTION ("easy_shift_extend", EASY_SHIFT_EXTEND)
> +
>  #undef AARCH64_EXTRA_TUNING_OPTION


> +
> +/* Return true iff X is an easy shift without a sign extend. */
> +

Again I don't like calling <= 4 "easy", it feels imprecise.

Thanks,
James



Re: [PATCH] PR libstdc++/81002 fix std::basic_regex range constructor

2017-06-07 Thread Jonathan Wakely

On 07/06/17 16:45 +0100, Jonathan Wakely wrote:

Apparently std::basic_regex construction from forward iterators has
never worked, because the call to __compile_nfa doesn't give the
traits type. This reorders the template parameters so that only the
traits type needs to be given explicitly, and the iterator type can be
deduced.

PR libstdc++/81002
* include/bits/regex.h (basic_regex): Adjust call to __compile_nfa
so iterator type is deduced.
* include/bits/regex_compiler.h (__compile_nfa): Reorder template
parameters to allow iterator type to be deduced.
* testsuite/28_regex/basic_regex/ctors/basic/iter.cc: New.

Tested powerpc64le-linux, committed to trunk.

I'll commit a simpler change to the branches.


Here's the patch for the branches. This doesn't reorder the template
parameters, just adds the missing template argument list to the call.

Tested x86_64-linux, committed to gcc-7-branch, gcc-6-branch and
gcc-5-branch.

commit 9181bcbd96c1efd5f9767104064f21f7a0c78457
Author: Jonathan Wakely 
Date:   Wed Jun 7 16:49:33 2017 +0100

PR libstdc++/81002 fix std::basic_regex range constructor

	PR libstdc++/81002
	* include/bits/regex_compiler.h (__compile_nfa): Add template argument
	list to specify traits type.
	* testsuite/28_regex/basic_regex/ctors/basic/iter.cc: New.

diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h
index 49c0184..20f72fa 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -209,9 +209,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 		  const typename _TraitsT::locale_type& __loc,
 		  regex_constants::syntax_option_type __flags)
 {
-  basic_string<typename _TraitsT::char_type> __str(__first, __last);
-  return __compile_nfa(__str.data(), __str.data() + __str.size(), __loc,
-  __flags);
+  using char_type = typename _TraitsT::char_type;
+  const basic_string<char_type> __str(__first, __last);
+  return __compile_nfa(__str.data(),
+	  __str.data() + __str.size(), __loc, __flags);
 }
 
   // [28.13.14]
diff --git a/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/basic/iter.cc b/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/basic/iter.cc
new file mode 100644
index 000..7776c5f
--- /dev/null
+++ b/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/basic/iter.cc
@@ -0,0 +1,30 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do compile { target c++11 } }
+
+#include <regex>
+#include <testsuite_iterators.h>
+
+void
+test01()
+{
+  char s[] = "";
+  __gnu_test::test_container<char, __gnu_test::forward_iterator_wrapper> c(s);
+  std::regex r1(c.begin(), c.end());
+  std::regex r2(c.begin(), c.end(), std::regex_constants::grep);
+}


Re: [PING][PATCH][ARM]Use different startfile and endfile for elf target when generating shared object.

2017-06-07 Thread Renlin Li

Ping ~

On 14/12/16 15:33, Renlin Li wrote:

Ping~

Regards,
Renlin

On 16/06/16 12:04, Renlin Li wrote:

Hi all,

GCC has startfile and endfile spec strings built into it.
startfile is used to specify object files to include at the start of the link 
process, while endfile is used to specify object files to include at the end 
of the link process.

crtbegin.o is one of the object files specified by the startfile spec string. 
IIUC, crtbeginS.o should be used in place of crtbegin.o when generating shared 
objects. The same applies to crtend.o, which is one of the endfiles: crtendS.o 
should be used when generating shared objects.

This patch makes the change to use different crtbegin and crtend files when 
creating shared and static objects for the elf toolchain. The linux toolchain 
already makes this differentiation.
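
A hypothetical sketch of what the differentiation looks like (spec names
and layout invented for illustration; the actual patch may differ), using
the %{shared:...;:...} spec alternation the linux targets already use:

  /* Hypothetical illustration only.  */
  #define UNKNOWN_ELF_STARTFILE_SPEC \
    " crti%O%s %{shared:crtbeginS%O%s;:crtbegin%O%s}"
  #define UNKNOWN_ELF_ENDFILE_SPEC \
    "%{shared:crtendS%O%s;:crtend%O%s} crtn%O%s"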

So when the toolchain doesn't support shared objects, the following error 
message will be produced:
ld: cannot find crtbeginS.o: No such file or directory

Still, those specs strings built into GCC can be overridden by using
-specs=command-line switch to specify a spec file.

arm-none-eabi regression tested without new issues. OK for trunk?

Regards,
Renlin Li

gcc/ChangeLog:

2016-06-16  Renlin Li  

 * config/arm/unknown-elf.h (UNKNOWN_ELF_STARTFILE_SPEC): Use
 crtbeginS.o for shared object.
 (UNKNOWN_ELF_ENDFILE_SPEC): Use crtendS.o for shared object.


Re: [PATCH] gcov: Add block_info::block_info (PR gcov-profile/80911).

2017-06-07 Thread Jan Hubicka
> 
> 2017-05-29  Martin Liska  
> 
>   PR gcov-profile/80911
>   * gcov.c (block_info::block_info): New constructor.

OK, thanks!
Honza
> ---
>  gcc/gcov.c | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/gcc/gcov.c b/gcc/gcov.c
> index a5aa4aadcac..e324cadad82 100644
> --- a/gcc/gcov.c
> +++ b/gcc/gcov.c
> @@ -132,6 +132,9 @@ struct block_location_info
>  
>  typedef struct block_info
>  {
> +  /* Constructor.  */
> +  block_info ();
> +
>/* Chain of exit and entry arcs.  */
>arc_t *succ;
>arc_t *pred;
> @@ -173,6 +176,14 @@ typedef struct block_info
>  
>  } block_t;
>  
> +block_info::block_info (): succ (NULL), pred (NULL), num_succ (0), num_pred 
> (0),
> +  id (0), count (0), count_valid (0), valid_chain (0), invalid_chain (0),
> +  exceptional (0), is_call_site (0), is_call_return (0), is_nonlocal_return 
> (0),
> +  locations (), chain (NULL)
> +{
> +  cycle.arc = NULL;
> +}
> +
>  /* Describes a single function. Contains an array of basic blocks.  */
>  
>  typedef struct function_info
> -- 
> 2.12.2
> 



Re: Fix profile updating in cfgbuild

2017-06-07 Thread Jan Hubicka
> >   {
> >-bb->count += e->count;
> >+if (e->count.initialized_p ())
> >+  {
> >+bb->count += e->count;
> >+initialized_src = true;
> >+  }
> >+else
> >+  uninitialized_src = false;
> 
> false?
> Please explain the false, and respectively the initializer (also false), in 
> light of the condition in the hunk below?

This is a thinko. It was meant to be true.  I am testing the fix.

Thanks!
Honza
> 
> thanks,
> 
> > bb->frequency += EDGE_FREQUENCY (e);
> >   }
> >+/* When some edges are missing with read profile, this is
> >+   most likely because RTL expansion introduced loop.
> >+   When profile is guessed we may have BB that is reachable
> >+   from unlikely path as well as from normal path.
> >+
> >+   TODO: We should handle loops created during BB expansion
> >+   correctly here.  For now we assume all those loop to cycle
> >+   precisely once.  */
> >+if (!initialized_src
> >+|| (uninitialized_src
> >+ && profile_status_for_fn (cfun) != PROFILE_READ))
> >+  bb->count = profile_count::uninitialized ();
> >   }
> >+else
> >+  /* If nothing changed, there is no need to create new BBs.  */
> >+  if (EDGE_COUNT (bb->succs) == n_succs[bb->index])
> >+continue;
> > 
> > compute_outgoing_frequencies (bb);
> >   }


Re: [PATCH v2] Generate reproducible output independently of the build-path

2017-06-07 Thread Ximin Luo
Ximin Luo:
> Joseph Myers:
>> On Tue, 11 Apr 2017, Ximin Luo wrote:
>>
>>> Copyright disclaimer
>>> 
>>>
>>> I dedicate these patches to the public domain by waiving all of my rights to
>>> the work worldwide under copyright law, including all related and 
>>> neighboring
>>> rights, to the extent allowed by law.
>>>
>>> See https://creativecommons.org/publicdomain/zero/1.0/legalcode for full 
>>> text.
>>>
>>> Please let me know if the above is insufficient and I will be happy to sign 
>>> any
>>> relevant forms.
>>
>> I believe the FSF wants its own disclaimer forms signed as evidence code 
>> is in the public domain.  The process for getting disclaimer forms is to 
>> complete 
>> https://git.savannah.gnu.org/cgit/gnulib.git/plain/doc/Copyright/request-disclaim.changes
>>  
>> and then you should be sent a disclaimer form for disclaiming the 
>> particular set of changes you have completed (if you then make further 
>> significant changes afterwards, the disclaimer form would then need 
>> completing for them as well).
>>
> 
> I've now done this, and the copyright clerk at the FSF has told me that this 
> is complete on their side as well.
> 
> Did any of you get a chance to look at the patch yet?
> 

Hi GCC patches list,

Any progress or feedback on this patch series?

Ximin

-- 
GPG: ed25519/56034877E1F87C35
GPG: rsa4096/1318EFAC5FBBDBCE
https://github.com/infinity0/pubkeys.git


Re: Fix profile updating in cfgbuild

2017-06-07 Thread Bernhard Reutner-Fischer
On 7 June 2017 08:44:13 CEST, Jan Hubicka  wrote:
>Hi,
>the following patch makes cfgbuild preserve the profile when loops are
>introduced at the RTL level (not very well, but at least it does not
>throw it all away) and also avoids re-computing probabilities when
>there are no changes to the CFG.
>
>Bootstrapped/regtested x86_64-linux. Committed.
>
>Honza
>
>Index: cfgbuild.c
>===
>--- cfgbuild.c (revision 248915)
>+++ cfgbuild.c (working copy)
>@@ -475,6 +475,10 @@ find_bb_boundaries (basic_block bb)
> 
> bb = fallthru->dest;
> remove_edge (fallthru);
>+/* BB is unreachable at this point - we need to determine its profile
>+   once edges are built.  */
>+bb->frequency = 0;
>+bb->count = profile_count::uninitialized ();
> flow_transfer_insn = NULL;
> if (code == CODE_LABEL && LABEL_ALT_ENTRY_P (insn))
>   make_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun), bb, 0);
>@@ -577,7 +581,7 @@ compute_outgoing_frequencies (basic_bloc
> guess_outgoing_edge_probabilities (b);
> }
> 
>-  if (b->count > profile_count::zero ())
>+  if (b->count.initialized_p ())
> FOR_EACH_EDGE (e, ei, b->succs)
>   e->count = b->count.apply_probability (e->probability);
> }
>@@ -590,6 +594,9 @@ void
> find_many_sub_basic_blocks (sbitmap blocks)
> {
>   basic_block bb, min, max;
>+  bool found = false;
>+  auto_vec<int> n_succs;
>+  n_succs.safe_grow_cleared (last_basic_block_for_fn (cfun));
> 
>   FOR_EACH_BB_FN (bb, cfun)
> SET_STATE (bb,
>@@ -597,11 +604,24 @@ find_many_sub_basic_blocks (sbitmap bloc
> 
>   FOR_EACH_BB_FN (bb, cfun)
> if (STATE (bb) == BLOCK_TO_SPLIT)
>-  find_bb_boundaries (bb);
>+  {
>+  int n = last_basic_block_for_fn (cfun);
>+  unsigned int ns = EDGE_COUNT (bb->succs);
>+
>+find_bb_boundaries (bb);
>+  if (n == last_basic_block_for_fn (cfun) && ns == EDGE_COUNT (bb->succs))
>+n_succs[bb->index] = EDGE_COUNT (bb->succs);
>+  }
> 
>   FOR_EACH_BB_FN (bb, cfun)
> if (STATE (bb) != BLOCK_ORIGINAL)
>-  break;
>+  {
>+  found = true;
>+break;
>+  }
>+
>+  if (!found)
>+return;
> 
>   min = max = bb;
>   for (; bb != EXIT_BLOCK_PTR_FOR_FN (cfun); bb = bb->next_bb)
>@@ -624,14 +644,37 @@ find_many_sub_basic_blocks (sbitmap bloc
> continue;
>   if (STATE (bb) == BLOCK_NEW)
> {
>+  bool initialized_src = false, uninitialized_src = false;
>   bb->count = profile_count::zero ();
>   bb->frequency = 0;
>   FOR_EACH_EDGE (e, ei, bb->preds)
> {
>-  bb->count += e->count;
>+  if (e->count.initialized_p ())
>+{
>+  bb->count += e->count;
>+  initialized_src = true;
>+}
>+  else
>+uninitialized_src = false;

false?
Please explain this 'false', and likewise the initializer (also false), in
light of the condition in the hunk below?

thanks,

>   bb->frequency += EDGE_FREQUENCY (e);
> }
>+  /* When some edges are missing with a read profile, this is
>+ most likely because RTL expansion introduced a loop.
>+ When the profile is guessed we may have a BB that is reachable
>+ from an unlikely path as well as from a normal path.
>+
>+ TODO: We should handle loops created during BB expansion
>+ correctly here.  For now we assume all those loops cycle
>+ precisely once.  */
>+  if (!initialized_src
>+  || (uninitialized_src
>+   && profile_status_for_fn (cfun) != PROFILE_READ))
>+bb->count = profile_count::uninitialized ();
> }
>+  else
>+/* If nothing changed, there is no need to create new BBs.  */
>+if (EDGE_COUNT (bb->succs) == n_succs[bb->index])
>+  continue;
> 
>   compute_outgoing_frequencies (bb);
>   }



[PATCH] Add C++17 deduction guide for std::basic_regex (P0433R2, partial)

2017-06-07 Thread Jonathan Wakely

C++17 deduction guide for std::basic_regex.

* include/bits/regex.h (basic_regex): Add deduction guide from P0433.
* testsuite/28_regex/basic_regex/ctors/deduction.cc: New.

Tested powerpc64le-linux, committed to trunk.
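
For illustration, what the new guide enables in user code (a hypothetical
example, not part of the patch; needs -std=gnu++17):

#include <regex>

int main()
{
  const char32_t pat[] = U"a+";
  // The character type is deduced from the iterator's value_type,
  // so this is std::basic_regex<char32_t>.
  std::basic_regex r(pat, pat + 2);
  return 0;
}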

commit 9f523df8e7338b52277b6f4942b85cb75de1cb6a
Author: Jonathan Wakely 
Date:   Wed Jun 7 14:28:58 2017 +0100

Add C++17 deduction guide for std::basic_regex (P0433R2, partial)

* include/bits/regex.h (basic_regex): Add deduction guide from P0433.
* testsuite/28_regex/basic_regex/ctors/deduction.cc: New.

diff --git a/libstdc++-v3/include/bits/regex.h 
b/libstdc++-v3/include/bits/regex.h
index 1710db9..0bb88cb 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -787,6 +787,13 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
   _AutomatonPtr  _M_automaton;
 };
 
+#if __cpp_deduction_guides >= 201606
+  template<typename _ForwardIterator>
+    basic_regex(_ForwardIterator, _ForwardIterator,
+		regex_constants::syntax_option_type = {})
+      -> basic_regex<typename iterator_traits<_ForwardIterator>::value_type>;
+#endif
+
   /** @brief Standard regular expressions. */
   typedef basic_regex<char>    regex;
 
diff --git a/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/deduction.cc 
b/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/deduction.cc
new file mode 100644
index 000..63b3f67
--- /dev/null
+++ b/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/deduction.cc
@@ -0,0 +1,61 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-options "-std=gnu++17" }
+// { dg-do compile { target c++1z } }
+
+#include <regex>
+#include <testsuite_iterators.h>
+
+template<typename T, typename U> struct require_same;
+template<typename T> struct require_same<T, T> { using type = void; };
+
+template<typename T, typename U>
+  typename require_same<T, U>::type
+  check_type(U&) { }
+
+void
+test01()
+{
+  std::basic_regex x("");
+  check_type<std::regex>(x);
+  char s[1] = {};
+  std::basic_regex x2(s);
+  check_type<std::regex>(x2);
+  std::basic_regex x3(U"");
+  check_type<std::basic_regex<char32_t>>(x3);
+  std::basic_regex x4(U"", std::regex_constants::grep);
+  check_type<std::basic_regex<char32_t>>(x4);
+
+  // Test explicit guide:
+  std::basic_regex x5(s, s+1);
+  check_type<std::regex>(x5);
+  std::basic_regex x6((const char*)s, (const char*)s+1);
+  check_type<std::regex>(x6);
+  std::basic_regex x7(s, s+1, std::regex_constants::grep);
+  check_type<std::regex>(x7);
+  __gnu_test::test_container<char, __gnu_test::forward_iterator_wrapper> f(s);
+  std::basic_regex x8(f.begin(), f.end());
+  check_type<std::regex>(x8);
+  std::basic_regex x9(f.begin(), f.end(), std::regex_constants::grep);
+  check_type<std::regex>(x9);
+
+  std::basic_regex copy = x;
+  check_type<std::regex>(copy);
+  std::basic_regex move = std::move(x);
+  check_type<std::regex>(move);
+}


[PATCH] PR libstdc++/81002 fix std::basic_regex range constructor

2017-06-07 Thread Jonathan Wakely

Apparently std::basic_regex construction from forward iterators has
never worked, because the call to __compile_nfa doesn't give the
traits type. This reorders the template parameters so that only the
traits type needs to be given explicitly, and the iterator type can be
deduced.
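
The shape of the fix can be shown with a standalone sketch (hypothetical
names, not the libstdc++ code): explicitly-specified template parameters
must come first, and the trailing ones can then be deduced from the call.

#include <cstddef>
#include <string>

// Only Traits needs to be spelled out at the call site; Iter is
// deduced from the arguments.
template<typename Traits, typename Iter>
std::size_t compile_nfa(Iter first, Iter last)
{
  return std::string(first, last).size();
}

struct my_traits { };

int main()
{
  std::string s = "abc";
  return compile_nfa<my_traits>(s.begin(), s.end()) == 3 ? 0 : 1;
}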

PR libstdc++/81002
* include/bits/regex.h (basic_regex): Adjust call to __compile_nfa
so iterator type is deduced.
* include/bits/regex_compiler.h (__compile_nfa): Reorder template
parameters to allow iterator type to be deduced.
* testsuite/28_regex/basic_regex/ctors/basic/iter.cc: New.

Tested powerpc64le-linux, committed to trunk.

I'll commit a simpler change to the branches.


commit a52761c07d0ac3eea7f1d0622995aab052ef4e31
Author: Jonathan Wakely 
Date:   Wed Jun 7 15:07:54 2017 +0100

PR libstdc++/81002 fix std::basic_regex range constructor

PR libstdc++/81002
* include/bits/regex.h (basic_regex): Adjust call to __compile_nfa
so iterator type is deduced.
* include/bits/regex_compiler.h (__compile_nfa): Reorder template
parameters to allow iterator type to be deduced.
* testsuite/28_regex/basic_regex/ctors/basic/iter.cc: New.

diff --git a/libstdc++-v3/include/bits/regex.h 
b/libstdc++-v3/include/bits/regex.h
index 15f0d08..1710db9 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -765,7 +765,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
: _M_flags((__f & (ECMAScript | basic | extended | awk | grep | egrep))
   ? __f : (__f | ECMAScript)),
_M_loc(std::move(__loc)),
-   _M_automaton(__detail::__compile_nfa<_FwdIter, _Rx_traits>(
+   _M_automaton(__detail::__compile_nfa<_Rx_traits>(
  std::move(__first), std::move(__last), _M_loc, _M_flags))
{ }
 
diff --git a/libstdc++-v3/include/bits/regex_compiler.h 
b/libstdc++-v3/include/bits/regex_compiler.h
index 49c0184..2c00939 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -191,7 +191,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   = typename enable_if< !__is_contiguous_normal_iter<_Iter>::value,
			    std::shared_ptr<const _NFA<_TraitsT>> >::type;
 
-  template<typename _FwdIter, typename _TraitsT>
+  template<typename _TraitsT, typename _FwdIter>
 inline __enable_if_contiguous_normal_iter<_FwdIter, _TraitsT>
 __compile_nfa(_FwdIter __first, _FwdIter __last,
  const typename _TraitsT::locale_type& __loc,
@@ -203,15 +203,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   return _Cmplr(__cfirst, __cfirst + __len, __loc, __flags)._M_get_nfa();
 }
 
-  template<typename _FwdIter, typename _TraitsT>
+  template<typename _TraitsT, typename _FwdIter>
 inline __disable_if_contiguous_normal_iter<_FwdIter, _TraitsT>
 __compile_nfa(_FwdIter __first, _FwdIter __last,
  const typename _TraitsT::locale_type& __loc,
  regex_constants::syntax_option_type __flags)
 {
-      basic_string<typename _TraitsT::char_type> __str(__first, __last);
-      return __compile_nfa(__str.data(), __str.data() + __str.size(), __loc,
-			   __flags);
+      const basic_string<typename _TraitsT::char_type> __str(__first, __last);
+      return __compile_nfa<_TraitsT>(__str.data(), __str.data() + __str.size(),
+				     __loc, __flags);
 }
 
   // [28.13.14]
diff --git a/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/basic/iter.cc 
b/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/basic/iter.cc
new file mode 100644
index 000..7776c5f
--- /dev/null
+++ b/libstdc++-v3/testsuite/28_regex/basic_regex/ctors/basic/iter.cc
@@ -0,0 +1,30 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do compile { target c++11 } }
+
+#include <regex>
+#include <testsuite_iterators.h>
+
+void
+test01()
+{
+  char s[] = "";
+  __gnu_test::test_container<char, __gnu_test::forward_iterator_wrapper> c(s);
+  std::regex r1(c.begin(), c.end());
+  std::regex r2(c.begin(), c.end(), std::regex_constants::grep);
+}


[PATCH, GCC/testsuite/ARM] Allow arm_arch_*_ok to test several macros

2017-06-07 Thread Thomas Preudhomme

Hi,

The general arm_arch_*_ok procedures check architecture availability by
substituting macros inside a defined() preprocessor operator. This limits
them to checking the definition of a single macro and forces ARMv7VE to
be special-cased.

This patch takes advantage of the fact that architecture macros, when
defined, are non-null to allow expressing architecture availability as
a boolean operation over possibly several macros. It then takes advantage
of this to deal with ARMv7VE in the general case.  The patch also adds a
comment to make it clear that check_effective_target_arm_arch_FUNC_ok
does not work as intended for architecture extensions (eg. ARMv8.1-A)
due to the lack of an extension-specific macro similar to __ARM_ARCH_*__.
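
Concretely, with the DEFS substitution the generated availability check
for v7ve reduces to a preprocessor test like this (as produced by the
patch):

/* The architecture macros, when defined, are non-zero; undefined macros
   evaluate to 0 inside #if, so no defined() operator is needed.  */
#if !(__ARM_ARCH_7A__ && __ARM_FEATURE_IDIV)
#error !(__ARM_ARCH_7A__ && __ARM_FEATURE_IDIV)
#endif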

ChangeLog entry is as follows:

*** gcc/testsuite/ChangeLog ***

2017-06-06  Thomas Preud'homme  

* lib/target-supports.exp (check_effective_target_arm_arch_FUNC_ok):
Test for null definitions instead of them being undefined.  Add entry
for ARMv7VE.  Reindent entry for ARMv8-M Baseline.  Add comment warning
about using the effective target for architecture extension.
(check_effective_target_arm_arch_v7ve_ok): Remove.
(add_options_for_arm_arch_v7ve): Likewise.

Testing:
- gcc.target/arm/atomic_loaddi_10.c passes with the patch for armv7ve
  but is marked unsupported for armv7-a
- verified in the logs that -march=armv7ve is correctly added when
  running gcc.target/arm/ftest-armv7ve-arm.c

Is this ok for trunk?

Best regards,

Thomas
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index ded6383cc1f9a1489cd83e1dace0c2fc48e252c3..e83ec757ae3c0dd7c3cad19cfd5d9577547d18a5 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3775,12 +3775,13 @@ proc check_effective_target_arm_fp16_hw { } {
 # can be selected and a routine to give the flags to select that architecture
 # Note: Extra flags may be added to disable options from newer compilers
 # (Thumb in particular - but others may be added in the future).
-# -march=armv7ve is special and is handled explicitly after this loop because
-# it needs more than one predefine check to identify.
+# Warning: Do not use check_effective_target_arm_arch_*_ok for architecture
+# extension (eg. ARMv8.1-A) since there is no macro defined for them.  See
+# how only __ARM_ARCH_8A__ is checked for ARMv8.1-A.
 # Usage: /* { dg-require-effective-target arm_arch_v5_ok } */
 #/* { dg-add-options arm_arch_v5 } */
 #	 /* { dg-require-effective-target arm_arch_v5_multilib } */
-foreach { armfunc armflag armdef } {
+foreach { armfunc armflag armdefs } {
 	v4 "-march=armv4 -marm" __ARM_ARCH_4__
 	v4t "-march=armv4t" __ARM_ARCH_4T__
 	v5 "-march=armv5 -marm" __ARM_ARCH_5__
@@ -3795,20 +3796,23 @@ foreach { armfunc armflag armdef } {
 	v7r "-march=armv7-r" __ARM_ARCH_7R__
 	v7m "-march=armv7-m -mthumb" __ARM_ARCH_7M__
 	v7em "-march=armv7e-m -mthumb" __ARM_ARCH_7EM__
+	v7ve "-march=armv7ve -marm"
+		"__ARM_ARCH_7A__ && __ARM_FEATURE_IDIV"
 	v8a "-march=armv8-a" __ARM_ARCH_8A__
 	v8_1a "-march=armv8.1a" __ARM_ARCH_8A__
 	v8_2a "-march=armv8.2a" __ARM_ARCH_8A__
-	v8m_base "-march=armv8-m.base -mthumb -mfloat-abi=soft" __ARM_ARCH_8M_BASE__
+	v8m_base "-march=armv8-m.base -mthumb -mfloat-abi=soft"
+		__ARM_ARCH_8M_BASE__
 	v8m_main "-march=armv8-m.main -mthumb" __ARM_ARCH_8M_MAIN__ } {
-eval [string map [list FUNC $armfunc FLAG $armflag DEF $armdef ] {
+eval [string map [list FUNC $armfunc FLAG $armflag DEFS $armdefs ] {
 	proc check_effective_target_arm_arch_FUNC_ok { } {
 	if { [ string match "*-marm*" "FLAG" ] &&
 		![check_effective_target_arm_arm_ok] } {
 		return 0
 	}
 	return [check_no_compiler_messages arm_arch_FUNC_ok assembly {
-		#if !defined (DEF)
-		#error !DEF
+		#if !(DEFS)
+		#error !(DEFS)
 		#endif
 	} "FLAG" ]
 	}
@@ -3829,26 +3833,6 @@ foreach { armfunc armflag armdef } {
 }]
 }
 
-# Same functions as above but for -march=armv7ve.  To uniquely identify
-# -march=armv7ve we need to check for __ARM_ARCH_7A__ as well as
-# __ARM_FEATURE_IDIV otherwise it aliases with armv7-a.
-
-proc check_effective_target_arm_arch_v7ve_ok { } {
-  if { [ string match "*-marm*" "-march=armv7ve" ] &&
-	![check_effective_target_arm_arm_ok] } {
-		return 0
-}
-  return [check_no_compiler_messages arm_arch_v7ve_ok assembly {
-  #if !defined (__ARM_ARCH_7A__) || !defined (__ARM_FEATURE_IDIV)
-  #error !armv7ve
-  #endif
-  } "-march=armv7ve" ]
-}
-
-proc add_options_for_arm_arch_v7ve { flags } {
-return "$flags -march=armv7ve"
-}
-
 # Return 1 if GCC was configured with --with-mode=
 proc check_effective_target_default_mode { } {
 


Re: [PATCH,AIX] Enable libiberty to read AIX XCOFF

2017-06-07 Thread DJ Delorie

"REIX, Tony"  writes:
> It appears that XNEWVEC() calls xmalloc which prints a message and
> calls xexit if malloc fails.

Objection removed then ;-)

> So, yes, we check if (strtab == NULL) though there is no way that
> XDELETEVEC(NULL) breaks something.  However, it is a classic
> programming style.

Yup, I noted that.  Just mentioning the inconsistency.


Re: [Patch, fortran] PR35339 Optimize implied do loops in io statements

2017-06-07 Thread Renlin Li
171.swim fails on aarch64-linux as well. I did a bisect and confirmed it's
r248877 causing the miscompare.


Regards,
Renlin

On 06/06/17 12:05, Markus Trippelsdorf wrote:

On 2017.06.05 at 22:39 +0200, Nicolas Koenig wrote:

With all the style fixes committed as r248877.


171.swim fails now. I didn't bisect, but I suspect your revision.



Re: [PATCH][GCC][ARM] Adjust costs so udiv is preferred over sdiv when both are valid. [Patch (2/2)]

2017-06-07 Thread Tamar Christina
Hi Kyrill,

I have updated the patch and regtested on arm-none-linux-gnueabihf.

OK for trunk?

Thanks,
Tamar

From: Tamar Christina
Sent: Wednesday, June 7, 2017 11:15:49 AM
To: Kyrill Tkachov; GCC Patches
Cc: nd; Ramana Radhakrishnan; Richard Earnshaw; ni...@redhat.com
Subject: RE: [PATCH][GCC][ARM] Adjust costs so udiv is preferred over sdiv when 
both are valid. [Patch (2/2)]

Hi Kyrill,

> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index
> b24143e32e2f10f3b150f7ed0df4fabb3cc8..ecc7688b1db6309a4dd694a8e
> 254e64abe14d7e3 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -9258,6 +9258,8 @@ arm_rtx_costs_internal (rtx x, enum rtx_code
> code, enum rtx_code outer_code,
>   *cost += COSTS_N_INSNS (speed_p ? extra_cost->mult[0].idiv : 0);
> else
>   *cost = LIBCALL_COST (2);
> +
> +  *cost += (code == DIV ? 1 : 0);
> return false; /* All arguments must be in registers.  */
>
>
> We usually try to avoid adjusting the costs in units other than
> COSTS_N_INSNS.
> Would adding COSTS_N_INSNS (1) here work?
> If so, could you also add a comment here to describe why we're adjusting the
> cost.

It would, I'm just slightly worried it might end up generating different code
for DIV then.
The reason I have used a unit smaller than COSTS_N_INSNS is so that it
shouldn't have any real impact on any other optimization, as the cost is
likely treated as an integer. It's only for things that compare the cost
values between signed and unsigned that the small unit would make a
difference.

Since I think the compiler still has some hard-coded cost limits somewhere
it may be an issue, but I'm not 100% certain. I can make the change though.

>
>   case MOD:
> @@ -9280,7 +9282,7 @@ arm_rtx_costs_internal (rtx x, enum rtx_code
> code, enum rtx_code outer_code,
>
>   /* Fall-through.  */
>   case UMOD:
> -  *cost = LIBCALL_COST (2);
> +  *cost = LIBCALL_COST (2) + (code == MOD ? 1 : 0);
>
> Same here.
>
> Thanks,
> Kyrill
>

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index b24143e32e2f10f3b150f7ed0df4fabb3cc8..442d12de4dcff50484229e2d27e65d78c3fd6b37 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -9258,6 +9258,10 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
 	*cost += COSTS_N_INSNS (speed_p ? extra_cost->mult[0].idiv : 0);
   else
 	*cost = LIBCALL_COST (2);
+
+  /* Make sdiv more expensive so that when both sdiv and udiv are
+	 possible, udiv is preferred.  */
+  *cost += (code == DIV ? COSTS_N_INSNS (1) : 0);
   return false;	/* All arguments must be in registers.  */
 
 case MOD:
@@ -9280,7 +9284,9 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
 
 /* Fall-through.  */
 case UMOD:
-  *cost = LIBCALL_COST (2);
+  /* Make sdiv more expensive so that when both sdiv and udiv are
+	 possible, udiv is preferred.  */
+  *cost = LIBCALL_COST (2) + (code == MOD ? COSTS_N_INSNS (1) : 0);
   return false;	/* All arguments must be in registers.  */
 
 case ROTATE:
diff --git a/gcc/testsuite/gcc.target/arm/sdiv_costs_1.c b/gcc/testsuite/gcc.target/arm/sdiv_costs_1.c
new file mode 100644
index ..76086ab9ce28fceb37a4e8a615a38923fa7b985a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/sdiv_costs_1.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a" } */
+
+/* Both sdiv and udiv can be used here, so prefer udiv.  */
+int f1 (unsigned char *p)
+{
+  return 100 / p[1];
+}
+
+int f2 (unsigned char *p, unsigned short x)
+{
+  return x / p[0];
+}
+
+int f3 (unsigned char *p, int x)
+{
+  x &= 0x7fff;
+  return x / p[0];
+}
+
+int f5 (unsigned char *p, unsigned short x)
+{
+  return x % p[0];
+}
+
+/* This should only generate signed divisions.  */
+int f4 (unsigned char *p)
+{
+  return -100 / p[1];
+}
+
+int f6 (unsigned char *p, short x)
+{
+  return x % p[0];
+}
+
+/* { dg-final { scan-assembler-times "udiv\tr\[0-9\]+, r\[0-9\]+, r\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "sdiv\tr\[0-9\]+, r\[0-9\]+, r\[0-9\]+" 2 } } */


Re: [PATCH] handle bzero/bcopy in DSE and aliasing (PR 80933, 80934)

2017-06-07 Thread Martin Sebor

On 06/07/2017 02:23 AM, Richard Biener wrote:

On Wed, Jun 7, 2017 at 5:26 AM, Martin Sebor  wrote:

Note I'd be _much_ more sympathetic to simply canonicalizing all of
bzero and bcopy
to memset / memmove and be done with all the above complexity.



Attached is an updated patch along these lines.  Please let me
know if it matches your expectations.


I think you attached the wrong patch.


Yes I did, sorry.  The correct one is attached.

Martin



Richard.


FWIW, although I don't feel too strongly about bzero et al. I'm
not sure that this approach is the right one in general.  It might
(slightly) simplify GCC itself, but other than the incidental code
generation improvement, it offers no benefit to users.  In some
cases, it even degrades user experience by causing GCC issue
diagnostics that refer to functions that don't appear in the source
code, such as for:

  char d[1];

  void* f (const void *p)
  {
bzero (d, 7);
  }

  warning: ‘__builtin_memset’ writing 7 bytes into a region of size 1
overflows the destination [-Wstringop-overflow=]

For some functions like mempcpy it might even result in worse code overall
(slower and bigger).

In other cases (like profiling) it loses interesting information.

I think these types of transformations would be justified if they
were done based on measurably improved efficiency of the generated
code, but I'm uneasy about swapping calls to one function for another
solely because it simplifies the implementation.  Not least because
it doesn't seem like a viable general approach to simplifying the
implementation.

Martin

PS I stopped short of simplifying GCC to remove the existing special
handling of these three built-ins.  If the patch is approved I'm
willing to do the cleanup in a subsequent pass.
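
For readers following along, a sketch of the canonicalization under
discussion (the mapping below follows the patch, which folds bcopy to
memcpy; note bcopy's swapped argument order):

#include <cstddef>
#include <cstring>

void
example (char *d, const char *s, std::size_t n)
{
  std::memset (d, 0, n);   // what bzero (d, n) is folded into
  std::memcpy (d, s, n);   // what bcopy (s, d, n) is folded into
}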


PR tree-optimization/80934 - bzero should be assumed not to escape pointer argument
PR tree-optimization/80933 - redundant bzero/bcopy calls not eliminated

gcc/ChangeLog:

	PR tree-optimization/80933
	PR tree-optimization/80934
	* builtins.c (fold_builtin_bcmp, fold_builtin_bcopy): New functions.
	(fold_builtin_bzero): Likewise.
	(fold_builtin_2): Handle bzero.
	(fold_builtin_3): Handle bcmp and bcopy.

gcc/testsuite/ChangeLog:

	PR tree-optimization/80933
	PR tree-optimization/80934
	* gcc.dg/fold-bcopy.c: New test.
	* gcc.dg/tree-ssa/ssa-dse-30.c: Likewise.
	* gcc.dg/tree-ssa/alias-36.c: Likewise.
	* gcc.dg/pr79214.c: Adjust.

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 30462ad..c6a2ec5 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -145,6 +145,9 @@ static rtx expand_builtin_unop (machine_mode, tree, rtx, rtx, optab);
 static rtx expand_builtin_frame_address (tree, tree);
 static tree stabilize_va_list_loc (location_t, tree, int);
 static rtx expand_builtin_expect (tree, rtx);
+static tree fold_builtin_bcmp (location_t, tree, tree, tree);
+static tree fold_builtin_bcopy (location_t, tree, tree, tree);
+static tree fold_builtin_bzero (location_t, tree, tree);
 static tree fold_builtin_constant_p (tree);
 static tree fold_builtin_classify_type (tree);
 static tree fold_builtin_strlen (location_t, tree, tree);
@@ -7982,6 +7985,53 @@ fold_builtin_sincos (location_t loc,
 			 fold_build1_loc (loc, REALPART_EXPR, type, call)));
 }
 
+/* Fold function call to built-in bzero with arguments DST and LEN
+   into a call to built-in memset (DST, 0, LEN).  */
+
+static tree
+fold_builtin_bzero (location_t loc, tree dst, tree len)
+{
+  if (!validate_arg (dst, POINTER_TYPE)
+  || !validate_arg (len, INTEGER_TYPE))
+return NULL_TREE;
+
+  tree fn = builtin_decl_implicit (BUILT_IN_MEMSET);
+  return build_call_expr_loc (loc, fn, 3, dst, integer_zero_node, len);
+}
+
+/* Fold function call to built-in bcmp with arguments ARG1, ARG2, and LEN
+   into a call to built-in memcmp(ARG1, ARG2, LEN).  */
+
+static tree
+fold_builtin_bcmp (location_t loc, tree arg1, tree arg2, tree len)
+{
+  if (tree ret = fold_builtin_memcmp (loc, arg1, arg2, len))
+return ret;
+
+  if (!validate_arg (arg1, POINTER_TYPE)
+  || !validate_arg (arg2, POINTER_TYPE)
+  || !validate_arg (len, INTEGER_TYPE))
+return NULL_TREE;
+
+  tree fn = builtin_decl_implicit (BUILT_IN_MEMCMP);
+  return build_call_expr_loc (loc, fn, 3, arg1, arg2, len);
+}
+
+/* Fold function call to built-in bcopy with arguments SRC, DST, and LEN
+   into a call to built-in memcpy(DST, SRC, LEN).  */
+
+static tree
+fold_builtin_bcopy (location_t loc, tree src, tree dst, tree len)
+{
+  if (!validate_arg (src, POINTER_TYPE)
+  || !validate_arg (dst, POINTER_TYPE)
+  || !validate_arg (len, INTEGER_TYPE))
+return NULL_TREE;
+
+  tree fn = builtin_decl_implicit (BUILT_IN_MEMCPY);
+  return build_call_expr_loc (loc, fn, 3, dst, src, len);
+}
+
 /* Fold function call to builtin memcmp with arguments ARG1 and ARG2.
Return NULL_TREE if no simplification can be made.  */
 
@@ -8947,6 +8997,9 @@ fold_builtin_2 (location_t loc, tree fndecl, tree arg0, tree arg1)
 

Re: [PATCH,AIX] Enable libiberty to read AIX XCOFF

2017-06-07 Thread David Edelsohn
On Wed, Jun 7, 2017 at 10:22 AM, REIX, Tony  wrote:
> Hi David,
>
> I'll fix the code incorrectly indented.

I already have fixed the indentation in my copy.

>
> About your comment on our code looking for the TEXT section by matching the
> string ".text", please note that our patch fixes a file called
> "simple-object-xcoff.c": SIMPLE.
> Do not expect us to handle more than required.
>
> However, are you sure that -ffunction-sections is implemented on AIX?

-ffunction-sections is implemented on AIX.  It is used quite
frequently for additional performance.  The libstdc++ library builds
with the option.

On second thought, this probably doesn't affect your code because the
implementation is working at the COFF section level, not the XCOFF
storage mapping class level.

>
> Moreover, if it is not implemented on AIX, don't you think that such an 
> option which is documented as:
> " Place each function or data item into its own section in the output file if 
> the
> target supports arbitrary sections. The name of the function or the name of
> the data item determines the section’s name in the output file.
> Use these options on systems where the linker can perform optimizations to
> improve locality of reference in the instruction space. Most systems using the
> ELF object format and SPARC processors running Solaris 2 have linkers with
> such optimizations. AIX may have these optimizations in the future.
> Only use these options when there are significant benefits from doing so. When
> you specify these options, the assembler and linker create larger object and
> executable files and are also slower."
> is not compatible with the already existing high complexity of the GCC Go
> implementation, and should be forbidden with Go on AIX?
>
>
> We have tried another approach:
> 127a128
>> #define STYP_TEXT 0x20
> 408a410
>>   unsigned int flags;
> 482a485,486
>> flags = fetch_32 (scnhdr + offsetof (struct external_scnhdr,
>>  u.xcoff64.s_flags));
> 489a494,495
>> flags = fetch_32 (scnhdr + offsetof (struct external_scnhdr,
>>  u.xcoff32.s_flags));
> 492c498
> <   if (strcmp (name, ".text") == 0)
> ---
>>   if ((flags & 0x) == STYP_TEXT)
>
> However, that makes never-seen-before errors appear in libgo tests that
> always succeeded before, like bufio & bytes.
>
>
> Since we have much other GCC Go work on AIX to handle, wouldn't it be
> possible to start with this implementation and improve it later if it
> needs to be hardened?
> Document it as a limitation.

As I wrote in my original reply, it's an incremental start.

".text" may be sufficient.  We'll see.

Thanks, David


RE: [PATCH,AIX] Enable libiberty to read AIX XCOFF

2017-06-07 Thread REIX, Tony
Hi David,

I'll fix the code incorrectly indented.

About your comment on our code looking for the TEXT section by matching the
string ".text", please note that our patch fixes a file called
"simple-object-xcoff.c": SIMPLE.
Do not expect us to handle more than required.

However, are you sure that -ffunction-sections is implemented on AIX?

Moreover, if it is not implemented on AIX, don't you think that such an option 
which is documented as:
" Place each function or data item into its own section in the output file if 
the
target supports arbitrary sections. The name of the function or the name of
the data item determines the section’s name in the output file.
Use these options on systems where the linker can perform optimizations to
improve locality of reference in the instruction space. Most systems using the
ELF object format and SPARC processors running Solaris 2 have linkers with
such optimizations. AIX may have these optimizations in the future.
Only use these options when there are significant benefits from doing so. When
you specify these options, the assembler and linker create larger object and
executable files and are also slower."
is not compatible with the already existing high complexity of the GCC Go
implementation, and should be forbidden with Go on AIX?


We have tried another approach:
127a128
> #define STYP_TEXT 0x20
408a410
>   unsigned int flags;
482a485,486
> flags = fetch_32 (scnhdr + offsetof (struct external_scnhdr,
>  u.xcoff64.s_flags));
489a494,495
> flags = fetch_32 (scnhdr + offsetof (struct external_scnhdr,
>  u.xcoff32.s_flags));
492c498
<   if (strcmp (name, ".text") == 0)
---
>   if ((flags & 0x) == STYP_TEXT)

However, that makes never-seen-before errors appear in libgo tests that
always succeeded before, like bufio & bytes.


Since we have much other GCC Go work on AIX to handle, wouldn't it be possible
to start with this implementation and improve it later if it needs to be
hardened?
Document it as a limitation.


Regards,

Best regards,

Tony Reix

Bull - ATOS
IBM Coop Architect & Technical Leader
Office : +33 (0) 4 76 29 72 67
1 rue de Provence - 38432 Échirolles - France
www.atos.net


From: David Edelsohn [dje@gmail.com]
Sent: Wednesday, June 7, 2017 01:25
To: REIX, Tony; Ian Taylor
Cc: SARTER, MATTHIEU (ext); GCC Patches
Subject: Re: [PATCH,AIX] Enable libiberty to read AIX XCOFF

Tony,

This patch generally looks good to me -- it clearly is an incremental
improvement.  One of the libiberty maintainers, such as Ian, needs to
approve the patch.

https://gcc.gnu.org/ml/gcc-patches/2017-05/msg01181.html

+  if (strcmp (name, ".text") == 0)
+textptr = scnptr;

The above code does not seem very robust.  What if the application is
compiled with -ffunction-sections so the text section is not named
".text"?

+  if (strtab == NULL)
+{
+ XDELETEVEC (symtab);
+  XDELETEVEC (scnbuf);
+  return errmsg;

The first XDELETEVEC (symtab) is indented incorrectly and should be fixed.

Thanks, David


Re: [PATCH, rs6000] Fold vector shifts in GIMPLE

2017-06-07 Thread Bill Schmidt

> On Jun 6, 2017, at 11:37 AM, Will Schmidt  wrote:
> 
> On Thu, 2017-06-01 at 10:15 -0500, Bill Schmidt wrote:
>>> On Jun 1, 2017, at 2:48 AM, Richard Biener  
>>> wrote:
>>> 
>>> On Wed, May 31, 2017 at 10:01 PM, Will Schmidt
>>>  wrote:
 Hi,
 
 Add support for early expansion of vector shifts.  Including
 vec_sl (shift left), vec_sr (shift right), vec_sra (shift
 right algebraic), vec_rl (rotate left).
 Part of this includes adding the vector shift right instructions to
 the list of those instructions having an unsigned second argument.
 
 The VSR (vector shift right) folding is a bit more complex than
 the others. This is due to requiring arg0 be unsigned for an algebraic
 shift before the gimple RSHIFT_EXPR assignment is built.
>>> 
>>> Jakub, do we sanitize that undefinedness of left shifts of negative values
>>> and/or overflow of left shift of nonnegative values?
> 
> 
> On Thu, 2017-06-01 at 10:17 +0200, Jakub Jelinek wrote:
>> We don't yet, see PR77823 - all I've managed to do before stage1 was over
>> was instrumentation of signed arithmetic integer overflow on vectors,
>> division, shift etc. are tasks maybe for this stage1.
>> 
>> That said, shift instrumentation in particular is done early because every
>> FE has different rules, and so if it is coming from target builtins that are
>> folded into something, it wouldn't be instrumented anyway. 
> 
> 
> On Thu, 2017-06-01 at 10:15 -0500, Bill Schmidt wrote:
>>> 
>>> Will, how is that defined in the intrinsics operation?  It might need 
>>> similar
>>> treatment as the abs case.
>> 
>> Answering for Will -- vec_sl is defined to simply shift bits off the end to 
>> the
>> left and fill with zeros from the right, regardless of whether the source 
>> type
>> is signed or unsigned.  The result type is signed iff the source type is
>> signed.  So a negative value can become positive as a result of the
>> operation.
>> 
>> The same is true of vec_rl, which will naturally rotate bits regardless of 
>> signedness.
> 
> 
>>> 
>>> [I'd rather make the negative left shift case implementation defined
>>> given C and C++ standards
>>> do not agree to 100% AFAIK]
> 
> With the above answers, how does this one stand?
> 
> [ I have no issue adding the TYPE_OVERFLOW_WRAPS logic to treat some of
> the cases differently, I'm just unclear on whether none/some/all of the
> shifts will require that logic.  :-) ]

I have to defer to Richard here, I don't know the subtleties well enough.

Bill
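
For reference, a sketch of the vec_sl behaviour described above (assumes a
PowerPC target compiled with -maltivec):

#include <altivec.h>

// Bits shifted out on the left are discarded and zeros fill in from the
// right regardless of signedness, so an element holding 0x80000000
// shifted left by one becomes 0: a negative value turns non-negative.
vector signed int
shift_left (vector signed int v, vector unsigned int amount)
{
  return vec_sl (v, amount);
}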

> 
> thanks,
> -Will
> 
> 
> 
> 
>>> 
>>> Richard.
>>> 
 [gcc]
 
 2017-05-26  Will Schmidt  
 
   * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
   for early expansion of vector shifts (sl,sr,sra,rl).
   (builtin_function_type): Add vector shift right instructions
   to the unsigned argument list.
 
 [gcc/testsuite]
 
 2017-05-26  Will Schmidt  
 
   * testsuite/gcc.target/powerpc/fold-vec-shift-char.c: New.
   * testsuite/gcc.target/powerpc/fold-vec-shift-int.c: New.
   * testsuite/gcc.target/powerpc/fold-vec-shift-longlong.c: New.
   * testsuite/gcc.target/powerpc/fold-vec-shift-short.c: New.
 
 diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
 index 8adbc06..6ee0bfd 100644
 --- a/gcc/config/rs6000/rs6000.c
 +++ b/gcc/config/rs6000/rs6000.c
 @@ -17408,6 +17408,76 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator 
 *gsi)
   gsi_replace (gsi, g, true);
   return true;
  }
 +/* Flavors of vec_rotate_left . */
 +case ALTIVEC_BUILTIN_VRLB:
 +case ALTIVEC_BUILTIN_VRLH:
 +case ALTIVEC_BUILTIN_VRLW:
 +case P8V_BUILTIN_VRLD:
 +  {
 +   arg0 = gimple_call_arg (stmt, 0);
 +   arg1 = gimple_call_arg (stmt, 1);
 +   lhs = gimple_call_lhs (stmt);
 +   gimple *g = gimple_build_assign (lhs, LROTATE_EXPR, arg0, arg1);
 +   gimple_set_location (g, gimple_location (stmt));
 +   gsi_replace (gsi, g, true);
 +   return true;
 +  }
 +  /* Flavors of vector shift right algebraic.  vec_sra{b,h,w} -> 
 vsra{b,h,w}. */
 +case ALTIVEC_BUILTIN_VSRAB:
 +case ALTIVEC_BUILTIN_VSRAH:
 +case ALTIVEC_BUILTIN_VSRAW:
 +case P8V_BUILTIN_VSRAD:
 +  {
 +   arg0 = gimple_call_arg (stmt, 0);
 +   arg1 = gimple_call_arg (stmt, 1);
 +   lhs = gimple_call_lhs (stmt);
 +   gimple *g = gimple_build_assign (lhs, RSHIFT_EXPR, arg0, arg1);
 +   gimple_set_location (g, gimple_location (stmt));
 +   gsi_replace (gsi, g, true);
 +   return true;
 +  }
 +   /* Flavors of vector shift left.  builtin_altivec_vsl{b,h,w} -> 
 

[C++ PATCH] Bitfield layout

2017-06-07 Thread Nathan Sidwell
There are some exciting rules for laying out a bitfield whose declared 
width is larger than its underlying type.  Essentially you have to go find 
the longest supported type no longer than the requested size, use that and 
then insert a bunch of inaccessible padding.


The code implementing that rule was more confusing than necessary, 
scanning the list of provided types for one that is too long, and then 
rewinding.  It then checked whether it had found a type shorter than 
requested and determined the padding.


This patch changes the search to simply record the most recent 
short-enough type and bail out of the loop when we find one too long. 
Finally, we simply check if the calculated padding is zero bits, and if 
so, don't put an empty padding field in.
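
For example (a sketch; the exact layout is target-dependent, assuming a
64-bit long long is the longest supported integer type):

struct S
{
  // Requests 80 bits: layout allocates the field as if it were a
  // suitably aligned 64-bit long long, and the remaining 16 bits
  // become inaccessible padding.
  long long f : 80;
};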


Applied to trunk.

nathan
--
Nathan Sidwell
2017-06-07  Nathan Sidwell  

	* class.c (layout_class_type): Restructure overlong-bitfield type
	search.

Index: class.c
===
--- class.c	(revision 248970)
+++ class.c	(working copy)
@@ -6426,41 +6426,39 @@ layout_class_type (tree t, tree *virtual
   if (DECL_C_BIT_FIELD (field)
 	  && tree_int_cst_lt (TYPE_SIZE (type), DECL_SIZE (field)))
 	{
-	  unsigned int itk;
-	  tree integer_type;
 	  bool was_unnamed_p = false;
 	  /* We must allocate the bits as if suitably aligned for the
-	 longest integer type that fits in this many bits.  type
-	 of the field.  Then, we are supposed to use the left over
-	 bits as additional padding.  */
-	  for (itk = itk_char; itk != itk_none; ++itk)
-	if (integer_types[itk] != NULL_TREE
-		&& (tree_int_cst_lt (size_int (MAX_FIXED_MODE_SIZE),
- TYPE_SIZE (integer_types[itk]))
-		|| tree_int_cst_lt (DECL_SIZE (field),
-	TYPE_SIZE (integer_types[itk]
-	  break;
+	 longest integer type that fits in this many bits.  Then,
+	 we are supposed to use the left over bits as additional
+	 padding.  */
 
-	  /* ITK now indicates a type that is too large for the
-	 field.  We have to back up by one to find the largest
-	 type that fits.  */
-	  do
-	  {
---itk;
-	integer_type = integer_types[itk];
-	  } while (itk > 0 && integer_type == NULL_TREE);
+	  /* Do not pick a type bigger than MAX_FIXED_MODE_SIZE.  */
+	  tree limit = size_int (MAX_FIXED_MODE_SIZE);
+	  if (tree_int_cst_lt (DECL_SIZE (field), limit))
+	limit = DECL_SIZE (field);
+
+	  tree integer_type = integer_types[itk_char];
+	  for (unsigned itk = itk_char; itk != itk_none; itk++)
+	if (tree next = integer_types[itk])
+	  {
+		if (tree_int_cst_lt (limit, TYPE_SIZE (next)))
+		  /* Too big, so our current guess is what we want.  */
+		  break;
+		/* Not bigger than limit, ok  */
+		integer_type = next;
+	  }
 
 	  /* Figure out how much additional padding is required.  */
-	  if (tree_int_cst_lt (TYPE_SIZE (integer_type), DECL_SIZE (field)))
-	{
-	  if (TREE_CODE (t) == UNION_TYPE)
-		/* In a union, the padding field must have the full width
-		   of the bit-field; all fields start at offset zero.  */
-		padding = DECL_SIZE (field);
-	  else
-		padding = size_binop (MINUS_EXPR, DECL_SIZE (field),
-  TYPE_SIZE (integer_type));
-	}
+	  if (TREE_CODE (t) == UNION_TYPE)
+	/* In a union, the padding field must have the full width
+	   of the bit-field; all fields start at offset zero.  */
+	padding = DECL_SIZE (field);
+	  else
+	padding = size_binop (MINUS_EXPR, DECL_SIZE (field),
+  TYPE_SIZE (integer_type));
+
+ 	  if (integer_zerop (padding))
+	padding = NULL_TREE;
 
 	  /* An unnamed bitfield does not normally affect the
 	 alignment of the containing class on a target where


Re: RFC: [PATCH] Add warn_if_not_aligned attribute

2017-06-07 Thread H.J. Lu
On Tue, Jun 6, 2017 at 5:11 PM, Martin Sebor  wrote:
> On 06/06/2017 04:57 PM, H.J. Lu wrote:
>>
>> On Tue, Jun 6, 2017 at 10:34 AM, Martin Sebor  wrote:
>>>
>>> On 06/06/2017 10:59 AM, H.J. Lu wrote:


 On Tue, Jun 6, 2017 at 9:10 AM, Martin Sebor  wrote:
>
>
> On 06/06/2017 10:07 AM, Martin Sebor wrote:
>>
>>
>>
>> On 06/05/2017 11:45 AM, H.J. Lu wrote:
>>>
>>>
>>>
>>> On Mon, Jun 5, 2017 at 8:11 AM, Joseph Myers
>>> 
>>> wrote:



 The new attribute needs documentation.  Should the test be in
 c-c++-common
>>>
>>>
>>>
>>>
>>> This feature does support C++.  But C++ compiler issues a slightly
>>> different warning at a different location.
>>>
 or does this feature not support C++?

>>>
>>> Here is the updated patch with documentation and a C++ test.  This
>>> patch caused a few testsuite failures:
>>>
>>> FAIL: gcc.dg/compat/struct-align-1 c_compat_x_tst.o compile
>>>
>>>
>>>
>>>
>>> /export/gnu/import/git/sources/gcc/gcc/testsuite/gcc.dg/compat//struct-align-1.h:169:1:
>>>
>>> warning: alignment 1 of 'struct B2_m_inner_p_outer' is less than 16
>>>
>>> FAIL: g++.dg/torture/pr80334.C   -O0  (test for excess errors)
>>>
>>>
>>>
>>>
>>> /export/gnu/import/git/sources/gcc/gcc/testsuite/g++.dg/torture/pr80334.C:4:8:
>>>
>>> warning: alignment 1 of 'B' is less than 16
>>>
>>
>> Users often want the ability to control a warning, even when it
>> certainly indicates a bug.  I would suggest to add an option to
>> make it possible for this warning as well.
>>
>> Btw., a bug related to some of those this warning is meant to
>> detect is assigning the address of an underaligned object to
>> a pointer of a natively aligned type.  Clang has an option
>> to detect this problem: -Waddress-of-packed-member.  It might
>> make a nice follow-on enhancement to add support for the same
>> thing.  I mention this because I think it would make sense to
>> consider this when choosing the name of the GCC option (i.e.,
>> rather than having two distinct but closely related warnings,
>> have one that detects both of these alignment type of bugs.
>
>
>
>
> A bug that has some additional context on this is pr 51628.
> A possible name for the new option suggested there is -Wpacked.
>
> Martin



 Isn't -Waddress-of-packed-member a subset of or the same as
 -Wpacked?
>>>
>>>
>>>
>>> In Clang it's neither.  -Waddress-of-packed-member only triggers
>>> when the address of a packed member is taken but not for the cases
>>> in bug 53037 (i.e., reducing the alignment of a member).  It's
>>> also enabled by default, while -Wpacked needs to be specified
>>> explicitly (i.e., it's in neither -Wall or -Wextra).
>>>
>>> FWIW, I don't really have a strong opinion about the names of
>>> the options.  My input is that the proliferation of fine-grained
>>> warning options for closely related problems tends to make it
>>> tricky to get their interactions right (both in the compiler
>>> and for users).  Enabling both/all such options can lead to
>>> multiple warnings for what boils down to essentially the same
>>> bug in the same expression, overwhelming the user in repetitive
>>> diagnostics.
>>>
>>
>> There is already -Wpacked.  Should I overload it for this?
>
>
> I'd say yes if -Wpacked were at least in -Wall.  But it's
> an opt-in kind of warning that's not even in -Wextra, and
> relaxing an explicitly specified alignment seems more like
> a bug than just something that might be nice to know about.
> I would expect warn_if_not_aligned to trigger a warning even
> without -Wall (i.e., as you have it in your patch, but with
> an option to control it).  That would suggest three levels
> of warnings:
>
> 1) warn by default (warn_if_not_aligned violation)
> 2) warn with -Wall (using a type with attribute aligned to
>define a member of a packed struct)
> 3) warn if requested (current -Wpacked)
>
> So one way to deal with it would be to change -Wpacked to
> take an argument between 0 and 3, set the default to
> correspond to the (1) above, and have -Wall bump it up to
> (2).
>
> If the equivalent of -Waddress-of-packed-member were to be
> implemented in GCC it would probably be a candidate to add
> to the (2) above.(*)
>
> This might be more involved than you envisioned.  A slightly
> simpler alternative would be to add a different option, say
> something like -Walign=N, and have it handle just (1) and
> (2) above, leaving -Wpacked unchanged.
>

Since there is no agreement on -W options and changes
may touch many places, I will do

1. Add -Wwarn-if-not-aligned and enable it by default.
2. Add -Wpacked-not-aligned and disable it by default.
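
To make the scenario concrete, this is the kind of code the default-on
warning targets (a sketch modeled on the pr80334.C failure quoted above;
the option names follow the plan in this message):

// 'aligned16' requests 16-byte alignment, but placing it in a packed
// struct drops its alignment to 1, which is diagnosed as
// "alignment 1 of ... is less than 16".
struct __attribute__ ((aligned (16))) aligned16
{
  long long x;
};

struct __attribute__ ((packed)) holder
{
  char c;
  aligned16 a;
};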

RE: [PATCH,AIX] Enable libiberty to read AIX XCOFF

2017-06-07 Thread REIX, Tony
Hi DJ

A) XNEWVEC

1) ./include/libiberty.h:

It appears that XNEWVEC() calls xmalloc which prints a message and calls xexit 
if malloc fails.

#define XNEWVEC(T, N) ((T *) xmalloc (sizeof (T) * (N)))

/* Allocate memory without fail.  If malloc fails, this will print a
   message to stderr (using the name set by xmalloc_set_program_name,
   if any) and then call xexit.  */
extern void *xmalloc (size_t) ATTRIBUTE_MALLOC ATTRIBUTE_RETURNS_NONNULL;


2)  ./libiberty/simple-object-xcoff.c :

It appears that  XNEWVEC() was already called 2 times before we added a third 
use of it, and still with NO check of return.

simple_object_xcoff_read_strtab (...)
{
...
  strtab = XNEWVEC (char, strsize);
  if (!simple_object_internal_read (sobj->descriptor, strtab_offset,
(unsigned char *) strtab, strsize, errmsg,
err))
...

simple_object_xcoff_find_sections (...)
{
 ...
  scnbuf = XNEWVEC (unsigned char, scnhdr_size * ocr->nscns);
  if (!simple_object_internal_read (sobj->descriptor,
sobj->offset + ocr->scnhdr_offset,
scnbuf, scnhdr_size * ocr->nscns, &errmsg,
err))


Thus, I think that we should continue to do what we did and NOT check the
return of XNEWVEC().



B) XDELETEVEC

1) ./include/libiberty.h:

#define XDELETEVEC(P) free ((void*) (P))


2) free() documentation : The free subroutine deallocates a  ... If the Pointer 
parameter is NULL, no action occurs.


So, yes, we check if (strtab == NULL), though there is no way that
XDELETEVEC(NULL) breaks anything.
However, it is a classic programming style.

And the same programming style was used before we added our patch in 
simple_object_xcoff_find_sections () :
  /* The real section name is found in the string
 table.  */
  if (strtab == NULL)
{
  strtab = simple_object_xcoff_read_strtab (sobj,
   &strtab_size,
   &errmsg, err);
  if (strtab == NULL)
{
  XDELETEVEC (scnbuf);
  return errmsg;
}
}

So our new code seems coherent with previous existing code.
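
A minimal illustration of the point about XDELETEVEC and NULL:

#include <cstdlib>

void
cleanup (char *buf)
{
  if (buf != NULL)    // redundant: free (NULL) is defined to do nothing
    std::free (buf);
  std::free (NULL);   // no action occurs, per the documentation quoted above
}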


Regards,

Best regards,

Tony Reix

Bull - ATOS
IBM Coop Architect & Technical Leader
Office : +33 (0) 4 76 29 72 67
1 rue de Provence - 38432 Échirolles - France
www.atos.net


From: DJ Delorie [d...@redhat.com]
Sent: Wednesday, June 7, 2017 01:52
To: David Edelsohn
Cc: REIX, Tony; i...@golang.org; SARTER, MATTHIEU (ext); 
gcc-patches@gcc.gnu.org
Subject: Re: [PATCH,AIX] Enable libiberty to read AIX XCOFF

David Edelsohn  writes:
> This patch generally looks good to me -- it clearly is an incremental
> improvement.  One of the libiberty maintainers, such as Ian, needs to
> approve the patch.

As AIX maintainer, I think you have the authority to approve patches
like this, which only affect your OS.  I see no reason to reject the
patch myself, other than:

+  symtab = XNEWVEC (struct external_syment, ocr->nsyms * SYMESZ);
+  if (!simple_object_internal_read (sobj->descriptor,

There's no check to see if XNEWVEC succeeded.


Also, the use of XDELETEVEC is inconsistently protected with a "if (foo
!= NULL)" throughout, but passing NULL to XDELETEVEC (essentially,
free()) is allowed anyway, so this is only a stylistic issue, which I'm
not particularly worried about.



[PATCH] Add C++17 deduction guide for std::basic_string (P0433R2, partial)

2017-06-07 Thread Jonathan Wakely

This adds the deduction guide for std::basic_string. The standard says
the guide needs to be constrained to only match types that meet (an
unspecified subset of) the requirements of InputIterators and
Allocators. I've added a new __is_allocator trait for that, which
checks for a nested value_type and checks for an allocate member
function that can be called with a size_t. It's worth noting that
std::allocator<void> does *not* meet that requirement, because it has
no allocate member. (In the terminology of the Networking TS
allocator<void> is a ProtoAllocator, meaning it can be rebound to
obtain an Allocator, but isn't necessarily an Allocator itself.)

I also needed to modify __alloc_traits, so it can only be instantiated
for types that might be allocators (or pseudo-allocators). This
prevents substitution errors outside the immediate context when
argument deduction performs overload resolution on basic_string
constructors, and tries to refer to invalid types such as
__alloc_traits::size_type. For a demonstration of this problem,
consider:

template<typename A>
struct allocator_traits {
 using size_type = typename A::size_type;
};

struct allocator { using size_type = unsigned; };

template<typename T, typename A = allocator>
struct container
{
 using size_type = typename allocator_traits<A>::size_type;

 container(const container&, unsigned, unsigned) { }

 container(size_type, T, const A& = A()) { }
};

int main()
{
 container c(1, '2');
 container d(c, 0, 0);
}

x.cc: In instantiation of 'struct allocator_traits<int>':
x.cc:15:3:   required by substitution of 'template<class T, class A> container(container<T, A>::size_type, T, const A&)-> container<T, A> [with T = int; A = int]'
x.cc:22:22:   required from here
x.cc:3:42: error: 'int' is not a class, struct, or union type
  using size_type = typename A::size_type;
 ^

Changing __alloc_traits avoids this, because the substitution failures
happen in the immediate context. The change to __alloc_traits
shouldn't affect any mangled symbols, because that type is only used
internally.

* include/bits/alloc_traits.h (__is_allocator, _RequireAllocator):
New trait and alias for detecting Allocator-like types.
* include/bits/basic_string.h (basic_string): Add deduction guide
from P0433.
* include/ext/alloc_traits.h (__gnu_cxx::__alloc_traits): Add template
parameter with default template argument that causes substitution
failures for types that cannot be allocators.
* testsuite/21_strings/basic_string/cons/char/deduction.cc: New.
* testsuite/21_strings/basic_string/cons/wchar_t/deduction.cc: New.

Tested powerpc64le-linux, committed to trunk.
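
As a usage illustration (hypothetical user code, needs -std=gnu++17):

#include <string>

int main()
{
  const char s[] = "hello";
  // The new guide deduces basic_string<char, char_traits<char>,
  // allocator<char>>, i.e. std::string, from the iterator pair.
  std::basic_string str(s, s + 5);
  return str.size() == 5 ? 0 : 1;
}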

commit c53655d1663e0c7edfffaae14bc47a04cbc2da0f
Author: Jonathan Wakely 
Date:   Wed Jun 7 13:18:42 2017 +0100

Add C++17 deduction guide for std::basic_string (P0433R2, partial)

* include/bits/alloc_traits.h (__is_allocator, _RequireAllocator):
New trait and alias for detecting Allocator-like types.
* include/bits/basic_string.h (basic_string): Add deduction guide
from P0433.
* include/ext/alloc_traits.h (__gnu_cxx::__alloc_traits): Add template
parameter with default template argument that causes substitution
failures for types that cannot be allocators.
* testsuite/21_strings/basic_string/cons/char/deduction.cc: New.
* testsuite/21_strings/basic_string/cons/wchar_t/deduction.cc: New.

diff --git a/libstdc++-v3/include/bits/alloc_traits.h 
b/libstdc++-v3/include/bits/alloc_traits.h
index 4d1e489..86a4859 100644
--- a/libstdc++-v3/include/bits/alloc_traits.h
+++ b/libstdc++-v3/include/bits/alloc_traits.h
@@ -598,6 +598,22 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 : is_copy_constructible<_Tp>
 { };
 
+#if __cplusplus >= 201103L
+  // Trait to detect Allocator-like types.
+  template<typename _Alloc, typename = void>
+    struct __is_allocator : false_type { };
+
+  template<typename _Alloc>
+    struct __is_allocator<_Alloc,
+      __void_t<typename _Alloc::value_type,
+	       decltype(declval<_Alloc&>().allocate(size_t{}))>>
+    : true_type { };
+
+  template<typename _Alloc>
+    using _RequireAllocator
+      = typename enable_if<__is_allocator<_Alloc>::value, _Alloc>::type;
+#endif
+
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
 
diff --git a/libstdc++-v3/include/bits/basic_string.h 
b/libstdc++-v3/include/bits/basic_string.h
index b6693c4..519d686 100644
--- a/libstdc++-v3/include/bits/basic_string.h
+++ b/libstdc++-v3/include/bits/basic_string.h
@@ -5674,6 +5674,18 @@ _GLIBCXX_END_NAMESPACE_CXX11
   };
 #endif  // !_GLIBCXX_USE_CXX11_ABI
 
+#if __cpp_deduction_guides >= 201606
+_GLIBCXX_BEGIN_NAMESPACE_CXX11
+  template<typename _InputIterator,
+	   typename _CharT
+	     = typename iterator_traits<_InputIterator>::value_type,
+	   typename _Allocator = allocator<_CharT>,
+	   typename = _RequireInputIter<_InputIterator>,
+	   typename = _RequireAllocator<_Allocator>>
+    basic_string(_InputIterator, _InputIterator, _Allocator = _Allocator())
+      -> basic_string<_CharT, char_traits<_CharT>, _Allocator>;
+_GLIBCXX_END_NAMESPACE_CXX11
+#endif
+
   // operator+
   /**
*  @brief  

Re: MinGW compilation warnings in libiberty's waitpid.c

2017-06-07 Thread Joel Brobecker
> I ended up not having time before going on holiday.  If the resync
> hasn't already been done, I'll do it now.

Thanks for doing that, Iain!

-- 
Joel


[PATCH v2][PING] Add no_tail_call attribute

2017-06-07 Thread Yuri Gribov
On Mon, May 29, 2017 at 8:14 AM, Yuri Gribov  wrote:
> Hi all,
>
> As discussed in
> https://sourceware.org/ml/libc-alpha/2017-01/msg00455.html , some
> libdl functions rely on return address to figure out the calling
> DSO and then use this information in computation (e.g. output of dlsym
> depends on which library called it).
>
> As reported in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66826 this
> may break under tailcall optimization i.e. in cases like
>
>   return dlsym(...);
>
> Carlos confirmed that they would prefer to have GCC attribute to
> prevent tailcalls
> (https://sourceware.org/ml/libc-alpha/2017-01/msg00502.html) so there
> you go.
>
> This was bootstrapped on x86_64. Given that this is a minor addition,
> I only ran newly added regtests. I hope that's enough (full testsuite
> would take a week on my notebook...).

Added docs, per Alex's suggestion.

-Y


0001-Added-no_tail_call-attribute.patch
Description: Binary data


Re: [PATCH 0/5 v3] Vect peeling cost model

2017-06-07 Thread Andreas Schwab
On Jun 07 2017, Robin Dapp  wrote:

>> http://gcc.gnu.org/ml/gcc-testresults/2017-06/msg00297.html
>
> What machine is this running on?

On a G5.

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."


[PATCH][GCC][AArch64] optimize float immediate moves (2/4) - HF/DF/SF mode.

2017-06-07 Thread Tamar Christina
Hi All, 


This patch adds support for creating floating point constants
using mov immediate instructions.  The movi SIMD instruction can
be used for HFmode and SFmode constants, eg. for -0.0f we generate:

movi v0.2s, 0x80, lsl 24

More complex constants can be generated using an integer MOV or
MOV+MOVK:

 mov   w0, 48128
 movk  w0, 0x47f0, lsl 16
 fmov  s0, w0

We allow up to 3 instructions as this allows all HF, SF and most DF
constants to be generated without a literal load, and is overall best
for codesize.
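
In source terms, the constants in question come from code like this
(a sketch; the exact sequence chosen depends on the constant's bit
pattern):

float neg_zero () { return -0.0f; }      // movi v0.2s, 0x80, lsl 24
float awkward ()  { return 123321.0f; }  // likely mov(+movk)+fmov, no literal load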


Regression tested on aarch64-none-linux-gnu and no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  

* config/aarch64/aarch64.md (mov): Generalize.
(*movhf_aarch64, *movsf_aarch64, *movdf_aarch64):
Add integer and movi cases.diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5adc5edb8dde9c30450b04932a37c41f84cc5ed1..7f107672882b13809be01355ffafbc2807cc5adb 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1167,66 +1167,120 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"  "Y ,?rY, w,w,m,w,m,rY,r"))]
+(define_insn_and_split "*movhf_aarch64"
+  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HF 1 "general_operand"  "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r"))]
   "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-|| aarch64_reg_or_fp_zero (operands[1], HFmode))"
+|| aarch64_reg_or_fp_float (operands[1], HFmode))"
   "@
movi\\t%0.4h, #0
-   mov\\t%0.h[0], %w1
+   fmov\\t%s0, %w1
umov\\t%w0, %1.h[0]
mov\\t%0.h[0], %1.h[0]
+   fmov\\t%s0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
ldr\\t%h0, %1
str\\t%h1, %0
ldrh\\t%w0, %1
strh\\t%w1, %0
mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,neon_from_gp,neon_to_gp,neon_move,\
- f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,yes,yes,yes,*,*,*,*,*")]
+  "&& can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], HFmode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  "{
+unsigned HOST_WIDE_INT ival;
+if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+  FAIL;
+
+rtx tmp = gen_reg_rtx (SImode);
+aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
+tmp = simplify_gen_subreg (HImode, tmp, SImode, 0);
+emit_move_insn (operands[0], gen_lowpart (HFmode, tmp));
+DONE;
+  }"
+  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
+		 neon_move,f_loads,f_stores,load1,store1,mov_reg")
+   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")]
 )
 
-(define_insn "*movsf_aarch64"
-  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:SF 1 "general_operand"  "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+(define_insn_and_split "*movsf_aarch64"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
+	(match_operand:SF 1 "general_operand"  "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r,M"))]
   "TARGET_FLOAT && (register_operand (operands[0], SFmode)
-|| aarch64_reg_or_fp_zero (operands[1], SFmode))"
+|| aarch64_reg_or_fp_float (operands[1], SFmode))"
   "@
movi\\t%0.2s, #0
fmov\\t%s0, %w1
fmov\\t%w0, %s1
fmov\\t%s0, %s1
fmov\\t%s0, %1
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
ldr\\t%s0, %1
str\\t%s1, %0
ldr\\t%w0, %1
str\\t%w1, %0
-   mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,\
- f_loads,f_stores,load1,store1,mov_reg")
-   (set_attr "simd" "yes,*,*,*,*,*,*,*,*,*")]
+   mov\\t%w0, %w1
+   mov\\t%w0, %1"
+  "&& can_create_pseudo_p ()
+   && !aarch64_can_const_movi_rtx_p (operands[1], SFmode)
+   && !aarch64_float_const_representable_p (operands[1])
+   &&  aarch64_float_const_rtx_p (operands[1])"
+  [(const_int 0)]
+  "{
+unsigned HOST_WIDE_INT ival;
+if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
+  FAIL;
+
+rtx tmp = gen_reg_rtx (SImode);
+aarch64_expand_mov_immediate (tmp, GEN_INT (ival));
+emit_move_insn (operands[0], gen_lowpart (SFmode, tmp));
+DONE;
+  }"
+  [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,neon_move,\
+		 f_loads,f_stores,load1,store1,mov_reg,\
+		 fconsts")
+   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
 )
 
-(define_insn "*movdf_aarch64"
-  [(set (match_operand:DF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w,m,r,m ,r")
-	(match_operand:DF 1 "general_operand"  "Y ,?rY, w,w,Ufc,m,w,m,rY,r"))]
+(define_insn_and_split "*movdf_aarch64"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")

[PATCH][GCC][AArch64] optimize float immediate moves (3 /4) - testsuite.

2017-06-07 Thread Tamar Christina
Hi All, 


This patch adds new tests to cover the newly generated code from this patch 
series.


Regression tested on aarch64-none-linux-gnu with no regressions.

OK for trunk?

Thanks,
Tamar

gcc/testsuite/
2017-06-07  Tamar Christina  
Bilyan Borisov  

* gcc.target/aarch64/dbl_mov_immediate_1.c: New.
* gcc.target/aarch64/flt_mov_immediate_1.c: New.
* gcc.target/aarch64/f16_mov_immediate_1.c: New.
* gcc.target/aarch64/f16_mov_immediate_2.c: New.

diff --git a/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c b/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c
new file mode 100644
index ..eb5b23b8f842c1f299bd58c8f944dce6234c111b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+double d0(void)
+{
+  double x = 0.0d;
+  return x;
+}
+
+double dn1(void)
+{
+  double x = -0.0d;
+  return x;
+}
+
+
+double d1(void)
+{
+  double x = 1.5d;
+  return x;
+}
+
+double d2(void)
+{
+  double x = 123256.0d;
+  return x;
+}
+
+double d3(void)
+{
+  double x = 123256123456.0d;
+  return x;
+}
+
+double d4(void)
+{
+  double x = 123456123456123456.0d;
+  return x;
+}
+
+/* { dg-final { scan-assembler-times "movi\td\[0-9\]+, ?#0" 1 } } */
+
+/* { dg-final { scan-assembler-times "adrp\tx\[0-9\]+, \.LC\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "ldr\td\[0-9\]+, \\\[x\[0-9\], #:lo12:\.LC\[0-9\]\\\]" 2 } } */
+
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]+, 1\\\.5e\\\+0"1 } } */
+
+/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, 25838523252736"   1 } } */
+/* { dg-final { scan-assembler-times "movk\tx\[0-9\]+, 0x40fe, lsl 48"  1 } } */
+/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, -9223372036854775808" 1 } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]+, x\[0-9\]+"   2 } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/f16_mov_immediate_1.c b/gcc/testsuite/gcc.target/aarch64/f16_mov_immediate_1.c
new file mode 100644
index ..7851dfca79487db28bd8bc25c268d93d14fa12b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/f16_mov_immediate_1.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+extern __fp16 foo ();
+extern void bar (__fp16* x);
+
+void f1 ()
+{
+  volatile __fp16 a = 17.0;
+}
+
+
+void f2 (__fp16 *a)
+{
+  *a = 17.0;
+}
+
+void f3 ()
+{
+  __fp16 b = foo ();
+  b = 17.0;
+  bar (&b);
+}
+
+__fp16 f4 ()
+{
+  __fp16 a = 0;
+  __fp16 b = 1;
+  __fp16 c = 2;
+  __fp16 d = 4;
+
+  __fp16 z = a + b;
+  z = z + c;
+  z = z - d;
+  return z;
+}
+
+__fp16 f5 ()
+{
+  __fp16 a = 16;
+  bar (&a);
+  return a;
+}
+
+/* { dg-final { scan-assembler-times "mov\tw\[0-9\]+, #?19520"   3 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.2s, 0xbc, lsl 8"  1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.2s, 0x4c, lsl 8"  1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/f16_mov_immediate_2.c b/gcc/testsuite/gcc.target/aarch64/f16_mov_immediate_2.c
new file mode 100644
index ..a7ff50ae5a712d6d4057aa6581cfd28fe9f511ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/f16_mov_immediate_2.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include 
+
+float16_t f0(void)
+{
+  float16_t x = 0.0f;
+  return x;
+}
+
+float16_t fn1(void)
+{
+  float16_t x = -0.0f;
+  return x;
+}
+
+float16_t f1(void)
+{
+  float16_t x = 256.0f;
+  return x;
+}
+
+float16_t f2(void)
+{
+  float16_t x = 123256.0f;
+  return x;
+}
+
+float16_t f3(void)
+{
+  float16_t x = 17.0;
+  return x;
+}
+
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.4h, ?#0" 1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.2s, 0x80, lsl 8" 1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.2s, 0x5c, lsl 8" 1 } } */
+/* { dg-final { scan-assembler-times "movi\tv\[0-9\]+\\\.2s, 0x7c, lsl 8" 1 } } */
+
+/* { dg-final { scan-assembler-times "mov\tw\[0-9\]+, 19520"  1 } } */
+/* { dg-final { scan-assembler-times "fmov\ts\[0-9\], w\[0-9\]+"  1 } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/flt_mov_immediate_1.c b/gcc/testsuite/gcc.target/aarch64/flt_mov_immediate_1.c
new file mode 100644
index ..7b92a5ae40fbd042a6b564a557118d8f8eac7abd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/flt_mov_immediate_1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+float f0(void)
+{
+  float x = 0.0f;
+  return x;
+}
+
+float fn1(void)
+{
+  float x = -0.0f;
+  return x;
+}
+
+float f1(void)
+{
+  float x = 256.0f;
+  return x;
+}
+
+float f2(void)
+{
+  float x = 123256.0f;
+  return x;
+}
+
+float f3(void)
+{
+  float x = 2.0f;
+  return x;
+}
+

[PATCH][GCC][AArch64] Inline calls to lrint when possible

2017-06-07 Thread Tamar Christina
Hi All,

This patch allows the inlining of lrint when -fno-math-errno
assuming that errno does not need to be set when the rounded value
is not representable as a long.

The case

void f(double *a, long *b, double x)
{
*a = __builtin_rint(x);
*b = __builtin_lrint(x);
}

now generates with -fno-math-errno:

f:
frintx  d0, d0
fcvtzs  x2, d0
str d0, [x0]
str x2, [x1]
ret

When the flag is not used the same function call is emitted as before:

f:
stp x29, x30, [sp, -32]!
frintx  d1, d0
add x29, sp, 0
str x19, [sp, 16]
mov x19, x1
str d1, [x0]
bl  lrint
str x0, [x19]
ldr x19, [sp, 16]
ldp x29, x30, [sp], 32
ret

Bootstrapped and regtested on aarch64-none-linux-gnu with no regressions.
The patch also has no regressions on Spec2006.

Ok for trunk?

gcc/
2017-06-07  Tamar Christina  

* config/aarch64/aarch64.md (lrint<GPF:mode><GPI:mode>2): New.

gcc/testsuite/
2017-06-07  Tamar Christina  

* gcc.target/aarch64/lrint-matherr.h: New.
* gcc.target/aarch64/inline-lrint_1.c: New.
* gcc.target/aarch64/inline-lrint_2.c: New.
* gcc.target/aarch64/no-inline-lrint_1.c: New.
* gcc.target/aarch64/no-inline-lrint_2.c: New.

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5adc5edb8dde9c30450b04932a37c41f84cc5ed1..c65159085e342f7611104b2890de99fc02e6fb8e 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4997,6 +4997,18 @@
   [(set_attr "type" "f_minmax")]
 )
 
+(define_expand "lrint2"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPF 1 "register_operand")]
+  "TARGET_FLOAT"
+{
+  rtx cvt = gen_reg_rtx (mode);
+  emit_insn (gen_rint2 (cvt, operands[1]));
+  emit_insn (gen_lbtrunc2 (operands[0], cvt));
+  DONE;
+}
+)
+
 ;; For copysign (x, y), we want to generate:
 ;;
 ;;   LDR d2, #(1 << 63)
diff --git a/gcc/testsuite/gcc.target/aarch64/inline-lrint_1.c b/gcc/testsuite/gcc.target/aarch64/inline-lrint_1.c
new file mode 100644
index ..876cecd674b7cb35bc18d5cd3aa5587813e53dd9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/inline-lrint_1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -fno-math-errno" } */
+
+#include "lrint-matherr.h"
+
+TEST (dld, double, long, )
+TEST (flf, float , long, )
+
+TEST (did, double, int, )
+TEST (fif, float , int, )
+
+TEST (dlld, double, long long, l)
+TEST (fllf, float , long long, l)
+
+/* { dg-final { scan-assembler-times "frintx\td\[0-9\]+, d\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "frintx\ts\[0-9\]+, s\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "fcvtzs\tx\[0-9\]+, d\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "fcvtzs\tx\[0-9\]+, s\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-not "bl"} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/inline-lrint_2.c b/gcc/testsuite/gcc.target/aarch64/inline-lrint_2.c
new file mode 100644
index ..baa5aee761788e2b83f8f9283bb0aa7d79aad348
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/inline-lrint_2.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O3 -fno-math-errno" } */
+
+#include "lrint-matherr.h"
+
+TEST (dld, double, long, )
+TEST (flf, float , long, )
+
+TEST (did, double, int, )
+TEST (fif, float , int, )
+
+TEST (dlld, double, long long, l)
+TEST (fllf, float , long long, l)
+
+/* { dg-final { scan-assembler-times "frintx\td\[0-9\]+, d\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "frintx\ts\[0-9\]+, s\[0-9\]+" 3 } } */
+/* { dg-final { scan-assembler-times "fcvtzs\tx\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "fcvtzs\tx\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "fcvtzs\tw\[0-9\]+, d\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler-times "fcvtzs\tw\[0-9\]+, s\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler-not "bl"} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/lrint-matherr.h b/gcc/testsuite/gcc.target/aarch64/lrint-matherr.h
new file mode 100644
index ..cc6e3d13f9bd47a316cc56a07917f4b5de185236
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/lrint-matherr.h
@@ -0,0 +1,5 @@
+#define TEST(name, float_type, int_type, pref) void f_##name (float_type x) \
+{	\
+  volatile float_type a = __builtin_rint (x);\
+  volatile int_type   b = __builtin_l##pref##rint (x);			\
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/no-inline-lrint_1.c b/gcc/testsuite/gcc.target/aarch64/no-inline-lrint_1.c
new file mode 100644
index ..fb7f0655687568e9d6783acf88ef56b54a73c2c5
--- /dev/null
+++ 

[PATCH][GCC][AArch64] optimize float immediate moves (1 /4) - infrastructure.

2017-06-07 Thread Tamar Christina
Hi All, 


This patch lays the groundwork to fix the immediate moves for floats
to use a combination of mov, movi, fmov instead of ldr and adrp to load
float constants that fit within the 16-bit limit of movz.

The idea behind it is that these are used quite often in masks etc., and we can
get a gain by doing integer moves instead of memory loads.

This patch also adds the patterns for SImode and DImode to use SIMD mov
instructions when possible.

It's particularly handy when masks are used such as the
0x80000000 mask in copysignf.

This now generates

movi    v2.2s, 0x80, lsl 24

instead of a literal load.
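
A sketch of the motivating case (the function name is hypothetical; the
built-in is the standard GCC one):

/* The sign-bit mask 0x80000000 used here can now be materialized with
   a SIMD movi instead of a literal-pool load.  */
float set_sign (float x, float y)
{
  return __builtin_copysignf (x, y);
}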


Regression tested on aarch64-none-linux-gnu with no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  

* config/aarch64/aarch64.c
(aarch64_simd_container_mode): Add prototype.
(aarch64_expand_mov_immediate): Add HI support.
(aarch64_reinterpret_float_as_int, aarch64_float_const_rtx_p): New.
(aarch64_can_const_movi_rtx_p): New.
(aarch64_preferred_reload_class):
Remove restrictions on using FP registers for certain SIMD operations.
(aarch64_rtx_costs): Added new cost for CONST_DOUBLE moves.
(aarch64_valid_floating_const): Add integer move validation.
(aarch64_simd_imm_scalar_p): Remove.
(aarch64_output_scalar_simd_mov_immediate): Generalize function.
(aarch64_legitimate_constant_p): Expand list of supported cases.
* config/aarch64/aarch64-protos.h
(aarch64_float_const_rtx_p, aarch64_can_const_movi_rtx_p): New.
(aarch64_reinterpret_float_as_int): New.
(aarch64_simd_imm_scalar_p): Remove.
* config/aarch64/predicates.md (aarch64_reg_or_fp_float): New.
* config/aarch64/constraints.md (Uvi): New.
(Dd): Split into Ds and new Dd.
* config/aarch64/aarch64.md (*movsi_aarch64):
Add SIMD mov case.
(*movdi_aarch64): Add SIMD mov case.

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 9543f8c9f2974ad7f8612aa007f975dd6eeec2bc..5a05137e4e28be09a1516ad6bbfce2661052195e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -313,6 +313,8 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 bool aarch64_expand_movmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
+bool aarch64_float_const_rtx_p (rtx);
+bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
 bool aarch64_function_arg_regno_p (unsigned);
 bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs);
 bool aarch64_gen_movmemqi (rtx *);
@@ -340,7 +342,6 @@ bool aarch64_regno_ok_for_base_p (int, bool);
 bool aarch64_regno_ok_for_index_p (int, bool);
 bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
 	bool high);
-bool aarch64_simd_imm_scalar_p (rtx x, machine_mode mode);
 bool aarch64_simd_imm_zero_p (rtx, machine_mode);
 bool aarch64_simd_scalar_immediate_valid_for_move (rtx, machine_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
@@ -475,4 +476,6 @@ std::string aarch64_get_extension_string_for_isa_flags (unsigned long,
 
 rtl_opt_pass *make_pass_fma_steering (gcc::context *ctxt);
 
+bool aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *fail);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a069427f576f6bd7336bbe4497249773bd33d138..a99a13460c2314ca9b40f82a466b6d492c49db97 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -147,6 +147,8 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 			 const_tree type,
 			 int misalignment,
 			 bool is_packed);
+static machine_mode
+aarch64_simd_container_mode (machine_mode mode, unsigned width);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -4613,6 +4615,66 @@ aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
   return true;
 }
 
+/* Return the binary representation of floating point constant VALUE in INTVAL.
+   If the value cannot be converted, return false without setting INTVAL.
+   The conversion is done in the given MODE.  */
+bool
+aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
+{
+  machine_mode mode = GET_MODE (value);
+  if (GET_CODE (value) != CONST_DOUBLE
+  || !SCALAR_FLOAT_MODE_P (mode)
+  || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
+return false;
+
+  unsigned HOST_WIDE_INT ival = 0;
+
+  /* Only support up to DF mode.  */
+  gcc_assert (GET_MODE_BITSIZE (mode) <= 64);
+  int needed = GET_MODE_BITSIZE (mode) == 64 ? 2 : 1;
+
+  long res[2];
+  real_to_target (res,
+		  CONST_DOUBLE_REAL_VALUE (value),
+		  REAL_MODE_FORMAT (mode));
+
+  ival = zext_hwi (res[needed - 1], 

[PATCH][GCC][AArch64] optimize integer immediate moves with partial masks.

2017-06-07 Thread Tamar Christina
Hi All, 

This patch optimizes integer moves for cases where the move could be done
more efficiently using a smaller mode.

For example:

long long f1(void)
{
  return 0x;
}

long f2(void)
{
  return 0x;
}

generates:


f1:
mov w0, 4294927974
ret

f2:
mov w0, 4294927974
movk    x0, 0x, lsl 48
ret

instead of:

f1:
mov x0, 26214
movk    x0, 0x, lsl 16
ret

f2:
mov x0, 26214
movk    x0, 0x, lsl 16
movk    x0, 0x, lsl 48

This works when the low 32 bits are either 0x or 0x (with  non-zero):
a 32-bit MOVN instruction can then be used as if the type were int rather
than long, because the write to the w-register is zero-extended to 64 bits.
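
As a hedged illustration (the constant is an assumption matching the
generated code above, since 4294927974 == 0xffff6666):

/* Low 32 bits of the required form, high 32 bits zero: one 32-bit MOVN
   suffices because a write to a w-register zero-extends to 64 bits.  */
long long
one_mov (void)
{
  return 0xffff6666LL;   /* mov w0, 4294927974 */
}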

Regression tested on aarch64-none-linux-gnu with no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  

* config/aarch64/aarch64.c
(aarch64_internal_mov_immediate): Add new special pattern.
* config/aarch64/aarch64.md (*movdi_aarch64):
Add reg/32bit const mov case.

gcc/testsuite/
2017-06-07  Tamar Christina  

* gcc.target/aarch64/int_mov_immediate_1.c: New.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a99a13460c2314ca9b40f82a466b6d492c49db97..e91586fa03c64b22c4c8efdf7b98d48c0086126d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1794,6 +1794,27 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
   return 1;
 }
 
+  val2 = val & 0xffffffff;
+  if (mode == DImode
+  && aarch64_move_imm (val2, SImode)
+  && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
+{
+  if (generate)
+	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+
+  /* Check if we have to emit a second instruction.  */
+  if (val == val2)
+	return 1;
+
+  i = (val >> 48) ? 48 : 32;
+
+  if (generate)
+	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+GEN_INT ((val >> i) & 0xffff)));
+
+  return 2;
+}
+
   if ((val >> 32) == 0 || mode == SImode)
 {
   if (generate)
@@ -1810,7 +1831,6 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 }
 
   /* Remaining cases are all for DImode.  */
-
   mask = 0xffff;
   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index fdba2d0adde2ef9e8519f6321f7456517c5e916a..5fcf809ae47552395667647e7299dcfe4ebdf7dd 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1060,8 +1060,8 @@
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,*w, m, m,r,r  ,*w, r,*w,w")
-	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,n,m, m,rZ,*w,S,Ush,rZ,*w,*w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r,*w, m, m,r,r  ,*w, r,*w,w")
+	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,m, m,rZ,*w,S,Ush,rZ,*w,*w,Dd"))]
   "(register_operand (operands[0], DImode)
 || aarch64_reg_or_zero (operands[1], DImode))"
   "@
@@ -1069,6 +1069,7 @@
mov\\t%0, %x1
mov\\t%x0, %1
mov\\t%x0, %1
+   mov\\t%w0, %1
#
ldr\\t%x0, %1
ldr\\t%d0, %1
@@ -1087,10 +1088,10 @@
aarch64_expand_mov_immediate (operands[0], operands[1]);
DONE;
 }"
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,load1,load1,store1,store1,\
-		adr,adr,f_mcr,f_mrc,fmov,neon_move")
-   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
-   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm, mov_imm,mov_imm,load1,\
+		load1,store1,store1,adr,adr,f_mcr,f_mrc,fmov,neon_move")
+   (set_attr "fp" "*,*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
+   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
 )
 
 (define_insn "insv_imm"
diff --git a/gcc/testsuite/gcc.target/aarch64/int_mov_immediate_1.c b/gcc/testsuite/gcc.target/aarch64/int_mov_immediate_1.c
new file mode 100644
index ..6ac9065037f881c96ca81661a7d717133c6cc83d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/int_mov_immediate_1.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3" } */
+
+long long f1(void)
+{
+  return 0x;
+}
+
+int f3(void)
+{
+  return 0x;
+}
+
+
+long f2(void)
+{
+  return 0x;
+}
+
+long f4(void)
+{
+  return 0x0001;
+}
+
+long f5(void)
+{
+  return 0x1ff;
+}
+
+long f6(void)
+{
+  return 0x;
+}
+
+long f7(void)
+{
+  return 0x;
+}
+
+long f8(void)
+{
+  return 0x0f00;
+}
+
+/* { dg-final { scan-assembler-times "mov\tw\[0-9\]+, -39322"  1 } } */
+/* { dg-final { 

[PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-06-07 Thread Tamar Christina
Hi All, 


This patch allows larger bitsizes to be used as the copy size
when the target does not have SLOW_UNALIGNED_ACCESS.

It also provides an optimized routine for MEM to REG
copying which avoids reconstructing the value piecewise on the stack
and instead uses a combination of shifts and ORs.

This now generates

adrp x0, .LANCHOR0
add x0, x0, :lo12:.LANCHOR0
sub sp, sp, #16
ldr w1, [x0, 120]
str w1, [sp, 8]
ldr x0, [x0, 112]
ldr x1, [sp, 8]
add sp, sp, 16

instead of:

adrp x3, .LANCHOR0
add x3, x3, :lo12:.LANCHOR0
mov x0, 0
mov x1, 0
sub sp, sp, #16
ldr x2, [x3, 112]
ldr w3, [x3, 120]
add sp, sp, 16
ubfx x5, x2, 8, 8
bfi x0, x2, 0, 8
ubfx x4, x2, 16, 8
lsr w9, w2, 24
bfi x0, x5, 8, 8
ubfx x7, x2, 32, 8
ubfx x5, x2, 40, 8
ubfx x8, x3, 8, 8
bfi x0, x4, 16, 8
bfi x1, x3, 0, 8
ubfx x4, x2, 48, 8
ubfx x6, x3, 16, 8
bfi x0, x9, 24, 8
bfi x1, x8, 8, 8
lsr x2, x2, 56
lsr w3, w3, 24
bfi x0, x7, 32, 8
bfi x1, x6, 16, 8
bfi x0, x5, 40, 8
bfi x1, x3, 24, 8
bfi x0, x4, 48, 8
bfi x0, x2, 56, 8

to load a struct of twelve 1-byte elements.
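
For concreteness, a hypothetical definition of such a struct (the actual
testcase layout is an assumption):

struct twelve_bytes { char c[12]; };
struct twelve_bytes g;

/* Copying g into registers triggers the MEM-to-REG path above.  */
struct twelve_bytes
get_g (void)
{
  return g;
}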

and

adrp x0, .LANCHOR0
add x0, x0, :lo12:.LANCHOR0
sub sp, sp, #16
ldrb w1, [x0, 18]
ldrh w0, [x0, 16]
orr w0, w0, w1, lsr 16
str w0, [sp, 8]
add sp, sp, 16

instead of

adrp x2, .LANCHOR0
add x2, x2, :lo12:.LANCHOR0
mov x0, 0
sub sp, sp, #16
ldrh w1, [x2, 16]
ldrb w2, [x2, 18]
add sp, sp, 16
bfi x0, x1, 0, 8
ubfx x1, x1, 8, 8
bfi x0, x1, 8, 8
bfi x0, x2, 16, 8

These changes only have an effect on structures smaller than 16 bytes.

The remaining stores come from an existing incomplete data-flow analysis
which thinks the value on the stack is being used and doesn't mark
the value as dead.

Regression tested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu with no
regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  

* expr.c (copy_blkmode_to_reg): Fix bitsize for targets
with fast unaligned access.
* config/aarch64/aarch64.c (aarch64_expand_movmem):
Add MEM to REG optimized case.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+   {
+ unsigned int max_align = UINTVAL (operands[2]);
+ max_align = n < max_align ? max_align : n;
+ machine_mode mov_mode, dest_mode
+   = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
+ rtx result = gen_reg_rtx (dest_mode);
+ emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+ unsigned int shift_cnt = 0;
+ for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
+   {
+	 int nearest = 0;
+	 /* Find the mode to use, but limit the max to TI mode.  */
+	 for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
+	  nearest = max;
+
+	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
+	  rtx reg = gen_reg_rtx (mov_mode);
+
+	  src = adjust_address (src, mov_mode, 0);
+	  emit_insn (gen_move_insn (reg, src));
+	  src = aarch64_progress_pointer (src);
+
+	  reg = gen_rtx_ASHIFT (dest_mode, reg,
+GEN_INT (shift_cnt * BITS_PER_UNIT));
+	  result = gen_rtx_IOR (dest_mode, reg, result);
+  }
+
+dst = adjust_address (dst, dest_mode, 0);
+emit_insn (gen_move_insn (dst, result));
+return true;
+  }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
  1-byte chunk.  */
   if (n < 4)
diff --git a/gcc/expr.c b/gcc/expr.c
index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
 
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
-  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
+  bitsize = BITS_PER_WORD;
+  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
+bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);

Re: [PATCH GCC8][31/33]Set range information for niter bound of vectorized loop

2017-06-07 Thread Bin.Cheng
On Wed, May 24, 2017 at 2:48 PM, Richard Biener
 wrote:
> On Mon, May 22, 2017 at 7:13 PM, Bin.Cheng  wrote:
>> On Fri, May 19, 2017 at 1:51 PM, Richard Biener
>>  wrote:
>>> On Mon, May 15, 2017 at 5:58 PM, Bin.Cheng  wrote:
 On Thu, May 11, 2017 at 12:02 PM, Richard Biener
  wrote:
> On Tue, Apr 18, 2017 at 12:54 PM, Bin Cheng  wrote
>> Hi,
>> Based on vect_peeling algorithm, we know for sure that vectorized loop 
>> must iterates at least once.
>> This patch sets range information for niter bounds of vectorized loop.  
>> This helps niter analysis,
>> so iv elimination too.
>> Is it OK?
>
>niters_vector = force_gimple_operand (niters_vector, &stmts, true,
> var);
>gsi_insert_seq_on_edge_immediate (pe, stmts);
> +  /* Peeling algorithm guarantees that vector loop bound is at least 
> ONE,
> +we set range information to make niters analyzer's life easier.  
> */
> +  if (TREE_CODE (niters_vector) == SSA_NAME)
> +   set_range_info (niters_vector, VR_RANGE, build_int_cst (type, 1),
> +   fold_build2 (RSHIFT_EXPR, type,
> +TYPE_MAX_VALUE (type), log_vf));
>
> if all of niters_vector folds to an original SSA name then
> niters_vector after gimplification
> is not a new SSA name and thus you can't set range-info on it.
>
> Likewise for the other case where LOOP_VINFO_NITERS is just an SSA name.
 Hi,
 This is updated patch.  It checks whether the result ssa name is newly
 created tmp and only sets range information if so.

 Is it OK?
>>>
>>> A better way to check whether the SSA name is new is to see if 'stmts'
>>> filled by force_gimple_operand is non-empty.
>> Hi,
>> Here is updated patch checking empty gimple_seq.  Is it OK?
>
> Ok.  As a bonus you could have used wide-ints to feed set_range_info
> (not sure if wide_int_ref & allows you to pass '1' literally...)
Patch applied @r248958.  Being lazy not using wide_int here, I will
pick up this after current work.

Thanks,
bin
>
> Richard.
>
>> Thanks,
>> bin
>>>
>>> Richard.
>>>
 Thanks,
 bin

 2017-04-11  Bin Cheng  

 * tree-vectorizer.h (vect_build_loop_niters): New parameter.
 * tree-vect-loop-manip.c (vect_build_loop_niters): New parameter.
 Set true to new parameter if new ssa variable is defined.
 (vect_gen_vector_loop_niters): Refactor.  Set range information
 for the new vector loop bound variable.
 (vect_do_peeling): Ditto.

>
> Richard.
>
>> Thanks,
>> bin
>> 2017-04-11  Bin Cheng  
>>
>> * tree-vect-loop-manip.c (vect_gen_vector_loop_niters): Refactor.
>> Set range information for vector loop bound variable.
>> (vect_do_peeling): Ditto.


Re: [PATCH GCC8][32/33]Save niter check for vect peeling if loop versioning is required

2017-06-07 Thread Bin.Cheng
On Thu, May 11, 2017 at 12:06 PM, Richard Biener
 wrote:
> On Tue, Apr 18, 2017 at 12:54 PM, Bin Cheng  wrote:
>> Hi,
>> When loop versioning is required in vectorization, we can merge niter check 
>> for vect
>> peeling with the check for loop versioning, thus save one check/branch for 
>> vectorized
>> loop.
>> Is it OK?
>
> Ok.
Applied @r248959.

Thanks,
bin
>
> Thanks,
> Richard.
>
>> Thanks,
>> bin
>> 2017-04-11  Bin Cheng  
>>
>> * tree-vect-loop-manip.c (vect_do_peeling): Don't skip vector loop
>> if versioning is required.
>> * tree-vect-loop.c (vect_analyze_loop_2): Merge niter check for loop
>> peeling with the check for versioning.


Re: [PATCH] PR c++/80990 use cv-qualifiers in class template argument deduction

2017-06-07 Thread Nathan Sidwell

On 06/06/2017 03:07 PM, Jonathan Wakely wrote:

This fixes class template argument deduction so that cv-qualifiers are
not ignored.

Bootstrapped and tested powerpc64le-linux. OK for trunk?

 PR c++/80990
 * pt.c (do_class_deduction): Build qualified type.


ok


--
Nathan Sidwell


Re: [PATCH GCC8][30/33]Fold more type conversion into binary arithmetic operations

2017-06-07 Thread Bin.Cheng
On Wed, May 17, 2017 at 1:27 PM, Richard Biener
 wrote:
> On Mon, May 15, 2017 at 5:56 PM, Bin.Cheng  wrote:
>> On Thu, May 11, 2017 at 11:54 AM, Richard Biener
>>  wrote:
>>> On Tue, Apr 18, 2017 at 12:53 PM, Bin Cheng  wrote:
 Hi,
 Simplification of (T1)(X *+- CST) is already implemented in 
 aff_combination_expand,
 this patch moves it to tree_to_aff_combination.  It also supports unsigned 
 types
 if range information allows the transformation, as well as special case 
 (T1)(X + X).
 Is it OK?
>>>
>>> Can you first please simply move it?
>>>
>>> +   /* In case X's type has wrapping overflow behavior, we can still
>>> +  convert (T1)(X - CST) into (T1)X - (T1)CST if X - CST doesn't
>>> +  overflow by range information.  Also Convert (T1)(X + CST) as
>>> +  if it's (T1)(X - (-CST)).  */
>>> +   if (TYPE_UNSIGNED (itype)
>>> +   && TYPE_OVERFLOW_WRAPS (itype)
>>> +   && TREE_CODE (op0) == SSA_NAME
>>> +   && TREE_CODE (op1) == INTEGER_CST
>>> +   && (icode == PLUS_EXPR || icode == MINUS_EXPR)
>>> +   && get_range_info (op0, &minv, &maxv) == VR_RANGE)
>>> + {
>>> +   if (icode == PLUS_EXPR)
>>> + op1 = fold_build1 (NEGATE_EXPR, itype, op1);
>>>
>>> Negating -INF will produce -INF(OVF) which we don't want to have in our IL,
>>> I suggest to use
>>>
>>>   op1 = wide_int_to_tree (itype, wi::neg (op1));
>>>
>>> instead.
>>>
>>> +   if (wi::geu_p (minv, op1))
>>> + {
>>> +   op0 = fold_convert (otype, op0);
>>> +   op1 = fold_convert (otype, op1);
>>> +   expr = fold_build2 (MINUS_EXPR, otype, op0, op1);
>>> +   tree_to_aff_combination (expr, type, comb);
>>> +   return;
>>> + }
>>> + }
>>>
>>> I think this is similar to a part of what Robin Dapp (sp?) is
>>> proposing as fix for PR69526?
>>>
>>> The same trick should work for (int)((unsigned)X - CST) with different
>>> overflow checks
>>> (you need to make sure the resulting expr does not overflow).
>> Hi,
>> As suggested, I separated the patch into three.  Other review comments
>> are also addressed.
>> I read Robin's PR and patch, I think it's two different issues sharing
>> some aspects, for example, the overflow check using range information
>> are quite the same.  In effect, this should also captures the result
>> of Robin's patch because we don't want to fold (T1)(x +- CST) in
>> general, but here in tree-affine.
>>
>> Bootstrap and test, is it OK?
>
> Ok.  Please commit as separate revisions.
Three patches applied separately @r248955/r248956/r248957.

Thanks,
bin
>
> Thanks,
> Richard.
>
>> Part1:
>> 2017-04-11  Bin Cheng  
>>
>> (aff_combination_expand): Move (T1)(X *+- CST) simplification to ...
>> (tree_to_aff_combination): ... here.
>>
>> Part2:
>> 2017-04-11  Bin Cheng  
>>
>> * tree-affine.c (tree_to_aff_combination): Handle (T1)(X + X).
>>
>> Part3:
>> 2017-04-11  Bin Cheng  
>>
>> * tree-affine.c (ssa.h): Include header file.
>> (tree_to_aff_combination): Handle (T1)(X - CST) when inner type
>> has wrapping overflow behavior.
>>
>> Thanks,
>> bin
>>>
>>> Richard.
>>>
>>>
 Thanks,
 bin
 2017-04-11  Bin Cheng  

 * tree-affine.c: Include header file.
 (aff_combination_expand): Move (T1)(X *+- CST) simplification to 
 ...
 (tree_to_aff_combination): ... here.  Support (T1)(X + X) case, and
 unsigned type case if range information allows.


Re: [PATCH GCC8][29/33]New register pressure estimation

2017-06-07 Thread Bin.Cheng
On Wed, May 17, 2017 at 1:24 PM, Richard Biener
 wrote:
> On Mon, May 15, 2017 at 5:50 PM, Bin.Cheng  wrote:
>> On Thu, May 11, 2017 at 11:39 AM, Richard Biener
>>  wrote:
>>> On Tue, Apr 18, 2017 at 12:53 PM, Bin Cheng  wrote:
 Hi,
 Currently IVOPTs shares the same register pressure computation with RTL 
 loop invariant pass,
 which doesn't work very well.  This patch introduces specific interface 
 for IVOPTs.
 The general idea is described in the cover message as below:
   C) Current implementation shares the same register pressure computation 
 with RTL loop
  inv pass.  It has difficulty in handling (especially large) loop 
 nest, and quite
  often generating too many candidates (especially for outer loops).  
 This change
  introduces new register pressure computation.  The brief idea is to 
 differentiate
  (hot) innermost loop and outer loop.  for (possibly hot) inner most, 
 more registers
  are allowed as long as the register pressure is within the range of 
 number of target
  available registers.
 It can also help to restrict number of candidates for outer loop.
 Is it OK?
>>>
>>> +/* Determine if current loop is the innermost loop and maybe hot.  */
>>> +
>>> +static void
>>> +determine_hot_innermost_loop (struct ivopts_data *data)
>>> +{
>>> +  data->hot_innermost_loop_p = true;
>>> +  if (!data->speed)
>>> +return;
>>>
>>> err, so when not optimizing for speed we assume all loops (even not 
>>> innermost)
>>> are hot and innermost?!
>>>
>>> +  HOST_WIDE_INT niter = avg_loop_niter (loop);
>>> +  if (niter < PARAM_VALUE (PARAM_AVG_LOOP_NITER)
>>> +  || loop_constraint_set_p (loop, LOOP_C_PROLOG)
>>> +  || loop_constraint_set_p (loop, LOOP_C_EPILOG)
>>> +  || loop_constraint_set_p (loop, LOOP_C_VERSION))
>>> +data->hot_innermost_loop_p = false;
>>>
>>> this needs adjustment for the constraint patch removal.  Also looking at 
>>> niter
>>> of the loop in question insn't a good metric for hotness.  data->speed is 
>>> the
>>> best guess you get I think (optimize_loop_for_speed_p).
>>>
>>>data->speed = optimize_loop_for_speed_p (loop);
>>> +  determine_hot_innermost_loop (data);
>>>
>>>   data->hot_innermost_loop_p = determine_hot_innermost_loop (data);
>>>
>>> would be more consistent here.
>> Hi,
>> I removed the hot innermost part and here is the updated version.  Is it OK?
>
> Ok.
Sorry for the long time delay, I committed the patch @r248954

Thanks,
bin
>
>> Thanks,
>> bin
>>
>> 2017-05-11  Bin Cheng  
>>
>> * tree-ssa-loop-ivopts.c (ivopts_estimate_reg_pressure): New
>> reg_pressure model function.
>> (ivopts_global_cost_for_size): Delete.
>> (determine_set_costs, iv_ca_recount_cost): Call new model function
>> ivopts_estimate_reg_pressure.
>>
>>>
>>> Thanks,
>>> Richard.
>>>
 Thanks,
 bin
 2017-04-11  Bin Cheng  

 * tree-ssa-loop-ivopts.c (struct ivopts_data): New field.
 (ivopts_estimate_reg_pressure): New reg_pressure model function.
 (ivopts_global_cost_for_size): Delete.
 (determine_set_costs, iv_ca_recount_cost): Call new model function
 ivopts_estimate_reg_pressure.
 (determine_hot_innermost_loop): New.
 (tree_ssa_iv_optimize_loop): Call above function.


Re: [PATCH 0/5 v3] Vect peeling cost model

2017-06-07 Thread Robin Dapp
> http://gcc.gnu.org/ml/gcc-testresults/2017-06/msg00297.html

What machine is this running on? power4 BE? The tests are compiled with
--with-cpu-64=power4 apparently.  I cannot reproduce this on power7
-m32.  Is it possible to get more detailed logs or machine access to
reproduce?

Regards
 Robin



Re: [PATCH GCC][4/5]Improve loop distribution to handle hmmer

2017-06-07 Thread Bin.Cheng
On Wed, Jun 7, 2017 at 11:03 AM, Richard Biener
 wrote:
> On Fri, Jun 2, 2017 at 1:51 PM, Bin Cheng  wrote:
>> Hi,
>> This is the main patch of the change.  It improves loop distribution by 
>> versioning loop under
>> runtime alias check conditions, as well as better partition fusion.  As 
>> described in comments,
>> the patch basically implements distribution in the following steps:
>>
>>  1) Seed partitions with specific type statements.  For now we support
>> two types seed statements: statement defining variable used outside
>> of loop; statement storing to memory.
>>  2) Build reduced dependence graph (RDG) for loop to be distributed.
>> The vertices (RDG:V) model all statements in the loop and the edges
>> (RDG:E) model flow and control dependences between statements.
>>  3) Apart from RDG, compute data dependences between memory references.
>>  4) Starting from seed statement, build up partition by adding depended
>> statements according to RDG's dependence information.  Partition is
>> classified as parallel type if it can be executed in parallel; or as
>> sequential type if it can't.  Parallel type partition is further
>> classified as different builtin kinds if it can be implemented as
>> builtin function calls.
>>  5) Build partition dependence graph (PG) based on data dependences.
>> The vertices (PG:V) model all partitions and the edges (PG:E) model
>> all data dependences between every pair of partitions.  In general,
>> data dependence is either compilation time known or unknown.  In C
>> family languages, there exist quite a number of compilation-time-unknown
>> dependences because of possible alias relations between data references.
>> We categorize PG's edge to two types: "true" edge that represents
>> compilation time known data dependences; "alias" edge for all other
>> data dependences.
>>  6) Traverse subgraph of PG as if all "alias" edges don't exist.  Merge
>> partitions in each strongly connected component (SCC) correspondingly.
>> Build new PG for merged partitions.
>>  7) Traverse PG again and this time with both "true" and "alias" edges
>> included.  We try to break SCCs by removing some edges.  Because
>> SCCs by "true" edges are all fused in step 6), we can break SCCs
>> by removing some "alias" edges.  It's NP-hard to choose optimal
>> edge set, fortunately simple approximation is good enough for us
>> given the small problem scale.
>>  8) Collect all data dependences of the removed "alias" edges.  Create
>> runtime alias checks for collected data dependences.
>>  9) Version loop under the condition of runtime alias checks.  Given
>> loop distribution generally introduces additional overhead, it is
>> only useful if vectorization is achieved in distributed loop.  We
>> version loop with internal function call IFN_LOOP_DIST_ALIAS.  If
>> no distributed loop can be vectorized, we simply remove distributed
>> loops and recover to the original one.
>>
>> Also, there are some more to improve in the future (which shouldn't be 
>> difficult):
>>TODO:
>>  1) We only distribute innermost loops now.  This pass should handle loop
>> nests in the future.
>>  2) We only fuse partitions in SCC now.  A better fusion algorithm is
>> desired to minimize loop overhead, maximize parallelism and maximize data reuse.
>>
>> This patch also fixes a couple of latent bugs in the original implementation.
>
> It would be nice to split this patch up, for example fixing the latent
> bugs first
> (so we can backport that part).
I only remember the major one, in which topological order is needed
when sorting statements; dominance order is not enough.  I will try to
split it as much as possible, but the change falls in one big chunk to
some extent.

>
> You now compute _all_ dependences in the loop while I originally designed loop
> distribution to do less dependence computation by only computing dependences
> between datarefs in different partitions (and delaying that until
> after partition fusing
> required by things not requiring dependence info).
> Please do not remove this optimization.
Right, I did that and cached it in a hash table because we need to query
the dependence of two references multiple times.  I will restore the
on-demand computing behavior.

Thanks,
bin
>
> Richard.
>
>> After this change, kernel loop in hmmer can be distributed and vectorized as 
>> a result.
>> This gives obvious performance improvement.  There is still inefficient code 
>> generation
>> issue which I will try to fix in loop split.  Apart from this, the next 
>> opportunity in hmmer
>> is to eliminate number of dead stores under proper alias information.
>> Bootstrap and test at O2/O3 on x86_64 and AArch64.  

Re: Handle data dependence relations with different bases

2017-06-07 Thread Richard Biener
On Wed, May 31, 2017 at 8:56 AM, Richard Sandiford
 wrote:
> Ping
>
> Richard Sandiford  writes:
>> Richard Biener  writes:
>>> On Thu, May 4, 2017 at 7:21 PM, Richard Sandiford
>>>  wrote:
 Richard Biener  writes:
> On Thu, May 4, 2017 at 2:12 PM, Richard Biener
>  wrote:
>> On Wed, May 3, 2017 at 10:00 AM, Richard Sandiford
>>  wrote:
>>> This patch tries to calculate conservatively-correct distance
>>> vectors for two references whose base addresses are not the same.
>>> It sets a new flag DDR_COULD_BE_INDEPENDENT_P if the dependence
>>> isn't guaranteed to occur.
>>>
>>> The motivating example is:
>>>
>>>   struct s { int x[8]; };
>>>   void
>>>   f (struct s *a, struct s *b)
>>>   {
>>> for (int i = 0; i < 8; ++i)
>>>   a->x[i] += b->x[i];
>>>   }
>>>
>>> in which the "a" and "b" accesses are either independent or have a
>>> dependence distance of 0 (assuming -fstrict-aliasing).  Neither case
>>> prevents vectorisation, so we can vectorise without an alias check.
>>>
>>> I'd originally wanted to do the same thing for arrays as well, e.g.:
>>>
>>>   void
>>>   f (int a[][8], int b[][8])
>>>   {
>>> for (int i = 0; i < 8; ++i)
>>>   a[0][i] += b[0][i];
>>>   }
>>>
>>> I think this is valid because C11 6.7.6.2/6 says:
>>>
>>>   For two array types to be compatible, both shall have compatible
>>>   element types, and if both size specifiers are present, and are
>>>   integer constant expressions, then both size specifiers shall have
>>>   the same constant value.
>>>
>>> So if we access an array through an int (*)[8], it must have type X[8]
>>> or X[], where X is compatible with int.  It doesn't seem possible in
>>> either case for "a[0]" and "b[0]" to overlap when "a != b".
>>>
>>> However, Richard B said that (at least in gimple) we support arbitrary
>>> overlap of arrays and allow arrays to be accessed with different
>>> dimensionality.  There are examples of this in PR50067.  I've therefore
>>> only handled references that end in a structure field access.
>>>
>>> There are two ways of handling these dependences in the vectoriser:
>>> use them to limit VF, or check at runtime as before.  I've gone for
>>> the approach of checking at runtime if we can, to avoid limiting VF
>>> unnecessarily.  We still fall back to a VF cap when runtime checks
>>> aren't allowed.
>>>
>>> The patch tests whether we queued an alias check with a dependence
>>> distance of X and then picked a VF <= X, in which case it's safe to
>>> drop the alias check.  Since vect_prune_runtime_alias_check_list can
>>> be called twice with different VF for the same loop, it's no longer
>>> safe to clear may_alias_ddrs on exit.  Instead we should use
>>> comp_alias_ddrs to check whether versioning is necessary.
>>>
>>> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>>
>> You seem to do your "fancy" thing but also later compute the old
>> base equality anyway (for same_base_p).  It looks to me for this
>> case the new fancy code can be simply skipped, keeping num_dimensions
>> as before?
>>
>> +  /* Try to approach equal type sizes.  */
>> +  if (!COMPLETE_TYPE_P (type_a)
>> + || !COMPLETE_TYPE_P (type_b)
>> + || !tree_fits_uhwi_p (TYPE_SIZE_UNIT (type_a))
>> + || !tree_fits_uhwi_p (TYPE_SIZE_UNIT (type_b)))
>> +   break;
>>
>> ah, interesting idea to avoid a quadratic search.  Note that you should
>> conservatively handle both BIT_FIELD_REF and VIEW_CONVERT_EXPR
>> as they are used for type-punning.

 All the component refs here should be REALPART_EXPRs, IMAGPART_EXPRs,
 ARRAY_REFs or COMPONENT_REFs of structures, since that's all that
 dr_analyze_indices allows, so I think we safe in terms of the tree codes.
>>>
>>> Yeah.  I think we need to document that we should have a 1:1 match here.
>>
>> OK, I added that to the comments and also added an access_fn_component_p
>> that we can assert on.
>>
>> I see nonoverlapping_component_refs_of_decl_p should simply skip
>> ARRAY_REFs - but I also see there:
>>
>>   /* ??? We cannot simply use the type of operand #0 of the refs here
>>  as the Fortran compiler smuggles type punning into 
>> COMPONENT_REFs
>>  for common blocks instead of using unions like everyone else.  
>> */
>>   tree type1 = DECL_CONTEXT (field1);
>>   tree type2 = DECL_CONTEXT (field2);
>>
>> so you probably can't simply use TREE_TYPE (outer_ref) for type 

RE: [PATCH][GCC][ARM] Adjust costs so udiv is preferred over sdiv when both are valid. [Patch (2/2)]

2017-06-07 Thread Tamar Christina
Hi Kyrill,

> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index b24143e32e2f10f3b150f7ed0df4fabb3cc8..ecc7688b1db6309a4dd694a8e254e64abe14d7e3 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -9258,6 +9258,8 @@ arm_rtx_costs_internal (rtx x, enum rtx_code
> code, enum rtx_code outer_code,
>   *cost += COSTS_N_INSNS (speed_p ? extra_cost->mult[0].idiv : 0);
> else
>   *cost = LIBCALL_COST (2);
> +
> +  *cost += (code == DIV ? 1 : 0);
> return false; /* All arguments must be in registers.  */
> 
> 
> We usually try to avoid adjusting the costs in units other than
> COSTS_N_INSNS.
> Would adding COSTS_N_INSNS (1) here work?
> If so, could you also add a comment here to describe why we're adjusting the
> cost.

It would, I'm just slightly worried it might end up generating different code
for DIV then.
The reason I have used a unit smaller than COSTS_N_INSNS is so that it
shouldn't have any real impact on any other optimization, as the cost is
likely treated as an integer.  It's only for things that compare the cost
values between signed and unsigned that the small unit would make a
difference.

Since I think the compiler still has some hard-coded cost limits somewhere
it may be an issue, but I'm not 100% certain.  I can make the change though.
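
For reference, the suggested variant would look something like this (a
sketch only; whether the full unit perturbs other cost comparisons is
exactly the open question above):

/* Bias signed division by one full instruction cost unit.  */
*cost += (code == DIV ? COSTS_N_INSNS (1) : 0);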

> 
>   case MOD:
> @@ -9280,7 +9282,7 @@ arm_rtx_costs_internal (rtx x, enum rtx_code
> code, enum rtx_code outer_code,
> 
>   /* Fall-through.  */
>   case UMOD:
> -  *cost = LIBCALL_COST (2);
> +  *cost = LIBCALL_COST (2) + (code == MOD ? 1 : 0);
> 
> Same here.
> 
> Thanks,
> Kyrill
> 



Re: Use base inequality for some vector alias checks

2017-06-07 Thread Richard Biener
On Wed, May 31, 2017 at 8:56 AM, Richard Sandiford
 wrote:
> Ping
>
> Richard Sandiford  writes:
>> This patch checks whether two data references x and y cannot
>> partially overlap and so are independent whenever &x != &y.
>> We can then use this in the vectoriser to optimise alias checks.
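
A minimal example of the kind of reference pair this targets (adapted
from the "Handle data dependence relations with different bases" thread
above):

struct s { int x[8]; };
void
f (struct s *a, struct s *b)
{
  /* *a and *b either overlap exactly (a == b) or not at all, so an
     address-inequality check can stand in for a range-overlap check.  */
  for (int i = 0; i < 8; ++i)
    a->x[i] += b->x[i];
}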
>>
>> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Looks good to me.  Probably needs refactoring now if Bin was faster
with factoring out the machinery to elsewhere.

Thanks,
Richard.

>> Thanks,
>> Richard
>>
>>
>> gcc/
>> 2016-05-03  Richard Sandiford  
>>
>>   * hash-traits.h (pair_hash): New struct.
>>   * tree-data-ref.h (data_dependence_relation): Add object_a and
>>   object_b fields.
>>   (DDR_OBJECT_A, DDR_OBJECT_B): New macros.
>>   * tree-data-ref.c (initialize_data_dependence_relation): Initialize
>>   DDR_OBJECT_A and DDR_OBJECT_B.
>>   * tree-vectorizer.h (vec_object_pair): New type.
>>   (_loop_vec_info): Add a check_unequal_addrs field.
>>   (LOOP_VINFO_CHECK_UNEQUAL_ADDRS): New macro.
>>   (LOOP_REQUIRES_VERSIONING_FOR_ALIAS): Return true if there is an
>>   entry in check_unequal_addrs.  Check comp_alias_ddrs instead of
>>   may_alias_ddrs.
>>   * tree-vect-loop.c (destroy_loop_vec_info): Release
>>   LOOP_VINFO_CHECK_UNEQUAL_ADDRS.
>>   (vect_analyze_loop_2): Likewise, when restarting.
>>   (vect_estimate_min_profitable_iters): Estimate the cost of
>>   LOOP_VINFO_CHECK_UNEQUAL_ADDRS.
>>   * tree-vect-data-refs.c: Include tree-hash-traits.h.
>>   (vect_prune_runtime_alias_test_list): Try to handle conflicts
>>   using LOOP_VINFO_CHECK_UNEQUAL_ADDRS, if the data dependence allows.
>>   Count such tests in the final summary.
>>   * tree-vect-loop-manip.c (chain_cond_expr): New function.
>>   (vect_create_cond_for_align_checks): Use it.
>>   (vect_create_cond_for_alias_checks): Likewise.
>>   (vect_create_cond_for_unequal_addrs): New function.
>>   (vect_loop_versioning): Call it.
>>
>> gcc/testsuite/
>>   * gcc.dg/vect/vect-alias-check-6.c: New test.
>>
>> Index: gcc/hash-traits.h
>> ===
>> --- gcc/hash-traits.h 2017-02-23 19:54:15.0 +
>> +++ gcc/hash-traits.h 2017-05-03 08:48:53.312035228 +0100
>> @@ -301,6 +301,76 @@ struct ggc_cache_ptr_hash : pointer_hash
>>
>>  struct nofree_string_hash : string_hash, typed_noop_remove  
>> {};
>>
>> +/* Traits for pairs of values, using the first to record empty and
>> +   deleted slots.  */
>> +
>> +template <typename T1, typename T2>
>> +struct pair_hash
>> +{
>> +  typedef std::pair <typename T1::value_type,
>> +		     typename T2::value_type> value_type;
>> +  typedef std::pair <typename T1::compare_type,
>> +		     typename T2::compare_type> compare_type;
>> +
>> +  static inline hashval_t hash (const value_type &);
>> +  static inline bool equal (const value_type &, const compare_type &);
>> +  static inline void remove (value_type &);
>> +  static inline void mark_deleted (value_type &);
>> +  static inline void mark_empty (value_type &);
>> +  static inline bool is_deleted (const value_type &);
>> +  static inline bool is_empty (const value_type &);
>> +};
>> +
>> +template <typename T1, typename T2>
>> +inline hashval_t
>> +pair_hash <T1, T2>::hash (const value_type &x)
>> +{
>> +  return iterative_hash_hashval_t (T1::hash (x.first), T2::hash (x.second));
>> +}
>> +
>> +template <typename T1, typename T2>
>> +inline bool
>> +pair_hash <T1, T2>::equal (const value_type &x, const compare_type &y)
>> +{
>> +  return T1::equal (x.first, y.first) && T2::equal (x.second, y.second);
>> +}
>> +
>> +template <typename T1, typename T2>
>> +inline void
>> +pair_hash <T1, T2>::remove (value_type &x)
>> +{
>> +  T1::remove (x.first);
>> +  T2::remove (x.second);
>> +}
>> +
>> +template <typename T1, typename T2>
>> +inline void
>> +pair_hash <T1, T2>::mark_deleted (value_type &x)
>> +{
>> +  T1::mark_deleted (x.first);
>> +}
>> +
>> +template <typename T1, typename T2>
>> +inline void
>> +pair_hash <T1, T2>::mark_empty (value_type &x)
>> +{
>> +  T1::mark_empty (x.first);
>> +}
>> +
>> +template <typename T1, typename T2>
>> +inline bool
>> +pair_hash <T1, T2>::is_deleted (const value_type &x)
>> +{
>> +  return T1::is_deleted (x.first);
>> +}
>> +
>> +template <typename T1, typename T2>
>> +inline bool
>> +pair_hash <T1, T2>::is_empty (const value_type &x)
>> +{
>> +  return T1::is_empty (x.first);
>> +}
>> +
>>  template <typename T> struct default_hash_traits : T {};
>>
>> template <typename T>
>> Index: gcc/tree-data-ref.h
>> ===
>> --- gcc/tree-data-ref.h   2017-05-03 08:48:48.737038502 +0100
>> +++ gcc/tree-data-ref.h   2017-05-03 08:48:53.313041828 +0100
>> @@ -240,6 +240,13 @@ struct data_dependence_relation
>> but the analyzer cannot be more specific.  */
>>tree are_dependent;
>>
>> +  /* If nonnull, COULD_BE_INDEPENDENT_P is true and the accesses are
>> + independent when the runtime addresses of OBJECT_A and OBJECT_B
>> + are different.  The addresses of 

Re: [PATCH][GCC][ARM] Adjust costs so udiv is preferred over sdiv when both are valid. [Patch (2/2)]

2017-06-07 Thread Kyrill Tkachov


On 02/05/17 16:37, Tamar Christina wrote:

Hi All,

This patch adjusts the cost model so that when both sdiv and udiv are possible
it prefers udiv over sdiv. This was done by making sdiv slightly more expensive
instead of making udiv cheaper to keep the baseline costs of a division the same
as before.

Similar to aarch64 this patch along with my other two related mid-end changes
makes a big difference in division by constants.

Given:

int f2(int x)
{
  return ((x * x) % 300) + ((x * x) / 300);
}

we now generate

f2:
mul r3, r0, r0
mov r0, r3
ldr r1, .L3
umull   r2, r3, r0, r1
lsr r2, r3, #5
add r3, r2, r2, lsl #2
rsb r3, r3, r3, lsl #4
sub r0, r0, r3, lsl #2
add r0, r0, r2
bx  lr

as opposed to

f2:
mul r3, r0, r0
mov r0, r3
ldr r3, .L4
push    {r4, r5}
smull   r4, r5, r0, r3
asr r3, r0, #31
rsb r3, r3, r5, asr #5
add r2, r3, r3, lsl #2
rsb r2, r2, r2, lsl #4
sub r0, r0, r2, lsl #2
add r0, r0, r3
pop {r4, r5}
bx  lr

Bootstrapped and reg tested on arm-none-eabi
with no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-05-02  Tamar Christina  

* config/arm/arm.c (arm_rtx_costs_internal): Make sdiv more expensive 
than udiv.


gcc/testsuite/
2017-05-02  Tamar Christina  

* gcc.target/arm/sdiv_costs_1.c: New.


diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index b24143e32e2f10f3b150f7ed0df4fabb3cc8..ecc7688b1db6309a4dd694a8e254e64abe14d7e3 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -9258,6 +9258,8 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
*cost += COSTS_N_INSNS (speed_p ? extra_cost->mult[0].idiv : 0);
   else
*cost = LIBCALL_COST (2);
+
+  *cost += (code == DIV ? 1 : 0);
   return false;  /* All arguments must be in registers.  */
 


We usually try to avoid adjusting the costs in units other than COSTS_N_INSNS.
Would adding COSTS_N_INSNS (1) here work?
If so, could you also add a comment here to describe why we're adjusting the 
cost.

 case MOD:
@@ -9280,7 +9282,7 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
 
 /* Fall-through.  */

 case UMOD:
-  *cost = LIBCALL_COST (2);
+  *cost = LIBCALL_COST (2) + (code == MOD ? 1 : 0);

Same here.

Thanks,
Kyrill




Re: [PATCH GCC][4/5]Improve loop distribution to handle hmmer

2017-06-07 Thread Richard Biener
On Fri, Jun 2, 2017 at 1:51 PM, Bin Cheng  wrote:
> Hi,
> This is the main patch of the change.  It improves loop distribution by 
> versioning loop under
> runtime alias check conditions, as well as better partition fusion.  As 
> described in comments,
> the patch basically implements distribution in the following steps:
>
>  1) Seed partitions with specific type statements.  For now we support
> two types seed statements: statement defining variable used outside
> of loop; statement storing to memory.
>  2) Build reduced dependence graph (RDG) for loop to be distributed.
> The vertices (RDG:V) model all statements in the loop and the edges
> (RDG:E) model flow and control dependences between statements.
>  3) Apart from RDG, compute data dependences between memory references.
>  4) Starting from seed statement, build up partition by adding depended
> statements according to RDG's dependence information.  Partition is
> classified as parallel type if it can be executed in parallel; or as
> sequential type if it can't.  Parallel type partition is further
> classified as different builtin kinds if it can be implemented as
> builtin function calls.
>  5) Build partition dependence graph (PG) based on data dependences.
> The vertices (PG:V) model all partitions and the edges (PG:E) model
> all data dependences between every pair of partitions.  In general,
> data dependence is either compilation time known or unknown.  In C
> family languages, there exist quite a number of compilation-time-unknown
> dependences because of possible alias relations between data references.
> We categorize PG's edge to two types: "true" edge that represents
> compilation time known data dependences; "alias" edge for all other
> data dependences.
>  6) Traverse subgraph of PG as if all "alias" edges don't exist.  Merge
> partitions in each strongly connected component (SCC) correspondingly.
> Build new PG for merged partitions.
>  7) Traverse PG again, this time with both "true" and "alias" edges
> included.  We try to break SCCs by removing some edges.  Because
> SCCs formed by "true" edges were all fused in step 6), we can break
> the remaining SCCs by removing some "alias" edges.  It's NP-hard to
> choose an optimal edge set; fortunately a simple approximation is
> good enough for us given the small problem scale.
>  8) Collect all data dependences of the removed "alias" edges.  Create
> runtime alias checks for collected data dependences.
>  9) Version the loop under the condition of the runtime alias checks.
> Given that loop distribution generally introduces additional
> overhead, it is only useful if vectorization is achieved in a
> distributed loop.  We version the loop with the internal function
> call IFN_LOOP_DIST_ALIAS.  If no distributed loop can be vectorized,
> we simply remove the distributed loops and recover the original one.
>
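To make steps 8) and 9) concrete, the versioned code has roughly the
following shape in plain C.  The non-overlap test shown is only an
illustrative stand-in for the generated checks; the real condition is the
IFN_LOOP_DIST_ALIAS call described above:

/* Sketch of runtime-alias-check loop versioning; a and b are assumed
   to point to arrays of at least n ints.  */
void
sketch (int *a, int *b, int n)
{
  if (a + n <= b || b + n <= a)     /* runtime alias check (step 8) */
    {
      /* Distributed loops, hopefully vectorizable (step 9).  */
      for (int i = 0; i < n; i++)
        a[i] = 0;
      for (int i = 0; i < n; i++)
        b[i] = b[i] + 1;
    }
  else
    {
      /* Original loop, used when the checks fail.  */
      for (int i = 0; i < n; i++)
        {
          a[i] = 0;
          b[i] = b[i] + 1;
        }
    }
}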
> Also, there are some more to improve in the future (which shouldn't be 
> difficult):
>TODO:
>  1) We only distribute innermost loops now.  This pass should handle loop
> nests in the future.
>  2) We only fuse partitions in SCC now.  A better fusion algorithm is
> desired to minimize loop overhead, maximize parallelism and maximize
>
> This patch also fixes a couple of latent bugs in the original implementation.

It would be nice to split this patch up, for example fixing the latent
bugs first
(so we can backport that part).

You now compute _all_ dependences in the loop while I originally designed
loop distribution to do less dependence computation by only computing
dependences between datarefs in different partitions (and delaying that
until after the partition fusing required by things not needing dependence
info).  Please do not remove this optimization.

Richard.

> After this change, the kernel loop in hmmer can be distributed and
> vectorized as a result.  This gives an obvious performance improvement.
> There is still an inefficient code generation issue which I will try to
> fix in loop split.  Apart from this, the next opportunity in hmmer is to
> eliminate a number of dead stores under proper alias information.
> Bootstrap and test at O2/O3 on x86_64 and AArch64.  is it OK?
>
> Thanks,
> bin
> 2017-05-31  Bin Cheng  
>
> * cfgloop.h (struct loop): New field ldist_alias_id.
> * cfgloopmanip.c (lv_adjust_loop_entry_edge): Refine comment for
> new internal function.
> * internal-fn.c (expand_LOOP_DIST_ALIAS): New function.
> * internal-fn.def (IFN_LOOP_DIST_ALIAS): New internal function.
> * tree-loop-distribution.c: Add general explanation on the pass.
> Include header file.
> (struct ddr_entry, struct ddr_entry_hasher): New structs.
> 

Re: [PATCH][GCC][ARM] Adjust costs so udiv is preferred over sdiv when both are valid. [Patch (2/2)]

2017-06-07 Thread Tamar Christina
Ping

From: gcc-patches-ow...@gcc.gnu.org  on behalf 
of Tamar Christina 
Sent: Monday, May 15, 2017 9:32:55 AM
To: GCC Patches
Cc: nd; Kyrylo Tkachov; Ramana Radhakrishnan; Richard Earnshaw; ni...@redhat.com
Subject: Re: [PATCH][GCC][ARM] Adjust costs so udiv is preferred over sdiv when 
both are valid. [Patch (2/2)]

Ping

From: gcc-patches-ow...@gcc.gnu.org  on behalf 
of Tamar Christina 
Sent: Tuesday, May 2, 2017 4:37:12 PM
To: GCC Patches
Cc: nd; Kyrylo Tkachov; Ramana Radhakrishnan; Richard Earnshaw; ni...@redhat.com
Subject: [PATCH][GCC][ARM] Adjust costs so udiv is preferred over sdiv when 
both are valid. [Patch (2/2)]

Hi All,

This patch adjusts the cost model so that when both sdiv and udiv are possible
it prefers udiv over sdiv. This was done by making sdiv slightly more expensive
instead of making udiv cheaper to keep the baseline costs of a division the same
as before.

Similar to AArch64, this patch, along with my other two related mid-end
changes, makes a big difference for division by constants.

Given:

int f2(int x)
{
  return ((x * x) % 300) + ((x * x) / 300);
}

we now generate

f2:
mul r3, r0, r0
mov r0, r3
ldr r1, .L3
umull   r2, r3, r0, r1
lsr r2, r3, #5
add r3, r2, r2, lsl #2
rsb r3, r3, r3, lsl #4
sub r0, r0, r3, lsl #2
add r0, r0, r2
bx  lr

as opposed to

f2:
mul r3, r0, r0
mov r0, r3
ldr r3, .L4
push    {r4, r5}
smull   r4, r5, r0, r3
asr r3, r0, #31
rsb r3, r3, r5, asr #5
add r2, r3, r3, lsl #2
rsb r2, r2, r2, lsl #4
sub r0, r0, r2, lsl #2
add r0, r0, r3
pop {r4, r5}
bx  lr
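
The unsigned sequence wins because the quotient needs only an unsigned
multiply-high and a shift, with no sign fixups.  A small self-contained
check of the kind of magic-constant division involved; the constant
458129845 = ceil(2^37 / 300) matches the total shift of 37 (32 for the
high word plus the "#5" above), though GCC's actual .L3 literal may
differ:

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t m = 458129845;  /* ceil(2^37 / 300) */
  for (uint32_t x = 0; x < 1000000; x++)
    if ((uint32_t) (((uint64_t) x * m) >> 37) != x / 300)
      {
        printf ("mismatch at %u\n", x);
        return 1;
      }
  puts ("magic division matches x / 300");
  return 0;
}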

Bootstrapped and reg tested on arm-none-eabi
with no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-05-02  Tamar Christina  

* config/arm/arm.c (arm_rtx_costs_internal): Make sdiv more expensive 
than udiv.


gcc/testsuite/
2017-05-02  Tamar Christina  

* gcc.target/arm/sdiv_costs_1.c: New.


Re: [PATCH GCC][5/5]Enable tree loop distribution at -O3 and above optimization levels.

2017-06-07 Thread Richard Biener
On Wed, Jun 7, 2017 at 10:49 AM, Bin.Cheng  wrote:
> On Wed, Jun 7, 2017 at 9:33 AM, Richard Biener
>  wrote:
>> On Wed, Jun 7, 2017 at 10:07 AM, Bin.Cheng  wrote:
>>> On Tue, Jun 6, 2017 at 6:47 PM, Jeff Law  wrote:
 On 06/02/2017 05:52 AM, Bin Cheng wrote:
> Hi,
> This patch enables -ftree-loop-distribution by default at -O3 and above 
> optimization levels.
> Bootstrap and test at O2/O3 on x86_64 and AArch64.  is it OK?
>
> Note I don't have strong opinion here and am fine with either it's 
> accepted or rejected.
>
> Thanks,
> bin
> 2017-05-31  Bin Cheng  
>
>   * opts.c (default_options_table): Enable OPT_ftree_loop_distribution
>   for -O3 and above levels.
 I think the question is how does this generally impact the performance
 of the generated code and to a lesser degree compile-time.

 Do you have any performance data?
>>> Hi Jeff,
>>> At this stage of the patch, only hmmer is impacted and obviously
>>> improved in my local run of spec2006 for x86_64 and AArch64.  In the
>>> long term, loop distribution is also a prerequisite transformation for
>>> handling bwaves (at least).  For these two impacted cases, it helps to
>>> resolve the gap against ICC.  I didn't check the compilation time
>>> slowdown; we can restrict the pass to problems with a small partition
>>> number if that's an issue.
>>
>> The source of extra compile-time will be dependence checking which
>> is quadratic, there is currently no limit in place on (# writes * (#
>> reads + # writes))
>> but one could easily be added.
> Ah yes, the patch moves dependence computation before partition
> construction now.  More likely this is the bottleneck now.

Ah, that's bad (I didn't look at the patch yet).  The idea of the current
scheme was that applying any cost-based merging reduces the number of
checks that need to be done.

Do you absolutely need to perform dependence checking upfront?

Richard.

>>
>> Note that I recently added -fopt-info support for loop distribution so
>> it should be
>> possible to get an idea how many loops in SPEC are distributed and if small,
>> double-check them.
> During development, quite a lot of loops get distributed.  I checked
> some of them and restricted the pass to not distribute cases where it
> does no good.  But I didn't re-check with the final version of the patch.
>>
>> The cost model at this point is very conservative but due to
>> implementation details
>> distributing a loop can cause quite some arithmetic to be duplicated like for
>>
>> int a[1024], b[1024];
>>
>> void foo()
>> {
>>   for (int i = 0; i < 1024; ++i)
>> {
>>a[i] = i * i * i ... * i;
>>b[i] = a[i];
>> }
>> }
>>
>> it will distribute to two loops both computing i * i * i  rather than
>> reading from a[i] in the second loop.
> Hmm, this patch no longer distributes this case.  I think it is more
> conservative than the original model; for example, the changed ldist
> tests are now not distributed because there is no benefit in doing so.
>
> Thanks,
> bin
>>
>> Richard.
>>
>>> Thanks,
>>> bin

 jeff



[PATCH][GRAPHITE] Fix PR79483

2017-06-07 Thread Richard Biener

When the order of visiting dominator children in domwalk changed,
GRAPHITE fell foul of its reliance on the particular order in which BBs
are visited when computing the original schedule from the vector of pbbs.

The following restores an order that I believe might work.

In the end the original schedule should probably be computed
not relying on the order of pbbs in the pbb array but by
visiting the SESE region in an edge walk that "works"
(build_original_schedule).  We seem to lack a BB -> pbb mapping
though.

So the patch somewhat feels like a hack - not fixing the real
problem in the design of build_original_schedule, but it seems
to work ...

Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?

Thanks,
Richard.

2017-06-07  Richard Biener  

PR tree-optimization/79483
* graphite-scop-detection.c (order): New global.
(get_order): Compute bb to order mapping that satisfies code
generation constraints.
(cmp_pbbs): New helper.
(build_scops): Start domwalk at entry block, sort generated
pbbs.

* gcc.dg/graphite/pr79483.c: New testcase.

Index: gcc/graphite-scop-detection.c
===
--- gcc/graphite-scop-detection.c   (revision 248914)
+++ gcc/graphite-scop-detection.c   (working copy)
@@ -1999,6 +1999,46 @@ gather_bbs::after_dom_children (basic_bl
 }
 }
 
+
+/* Compute something like an execution order: dominator order, but first
+   executing edges that stay inside the current loop and delaying the
+   processing of exit edges.  */
+
+static vec<unsigned> order;
+
+static void
+get_order (scop_p scop, basic_block bb, vec<unsigned> *order, unsigned *dfs_num)
+{
+  if (! bb_in_sese_p (bb, scop->scop_info->region))
+return;
+
+  (*order)[bb->index] = (*dfs_num)++;
+  for (basic_block son = first_dom_son (CDI_DOMINATORS, bb);
+   son;
+   son = next_dom_son (CDI_DOMINATORS, son))
+if (flow_bb_inside_loop_p (bb->loop_father, son))
+  get_order (scop, son, order, dfs_num);
+  for (basic_block son = first_dom_son (CDI_DOMINATORS, bb);
+   son;
+   son = next_dom_son (CDI_DOMINATORS, son))
+if (! flow_bb_inside_loop_p (bb->loop_father, son))
+  get_order (scop, son, order, dfs_num);
+}
+
+/* Helper for qsort, sorting after order above.  */
+
+static int
+cmp_pbbs (const void *pa, const void *pb)
+{
+  poly_bb_p bb1 = *((const poly_bb_p *)pa);
+  poly_bb_p bb2 = *((const poly_bb_p *)pb);
+  if (order[bb1->black_box->bb->index] < order[bb2->black_box->bb->index])
+return -1;
+  else if (order[bb1->black_box->bb->index] > order[bb2->black_box->bb->index])
+return 1;
+  else
+return 0;
+}
+
 /* Find Static Control Parts (SCoP) in the current function and pushes
them to SCOPS.  */
 
@@ -2022,7 +2062,18 @@ build_scops (vec *scops)
   scop_p scop = new_scop (s->entry, s->exit);
 
   /* Record all basic blocks and their conditions in REGION.  */
-  gather_bbs (CDI_DOMINATORS, scop).walk (cfun->cfg->x_entry_block_ptr);
+  gather_bbs (CDI_DOMINATORS, scop).walk (s->entry->dest);
+
+  /* domwalk does not fulfil our code-generation constraints on the
+     order of pbbs, which is to produce something like execution order,
+     delaying execution of loop exit edges.  So compute such an order
+     and sort the pbbs after it.  */
+  order.create (last_basic_block_for_fn (cfun));
+  order.quick_grow (last_basic_block_for_fn (cfun));
+  unsigned dfs_num = 0;
+  get_order (scop, s->entry->dest, &order, &dfs_num);
+  scop->pbbs.qsort (cmp_pbbs);
+  order.release ();
 
   build_alias_set (scop);
 
Index: gcc/testsuite/gcc.dg/graphite/pr79483.c
===
--- gcc/testsuite/gcc.dg/graphite/pr79483.c (nonexistent)
+++ gcc/testsuite/gcc.dg/graphite/pr79483.c (working copy)
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fgraphite-identity" } */
+
+int *a;
+extern int b[];
+int c;
+void d ()
+{
+  double e[2][3] = {0.0, 0.0, 1.0};
+  for (int f = 0; f < 2; ++f)
+for (int g = 0; g < 6; ++g)
+  b[0] = a[g] * e[f][2];
+  c = b[0];
+}


[PATCH] Another testsuite fallout of PR80928

2017-06-07 Thread Richard Biener

2017-06-07  Richard Biener  

PR tree-optimization/80928
* gcc.dg/vect/slp-13.c: Adjust patterns with vect_pack_trunc.
* gcc.dg/vect/slp-13-big-array.c: Likewise.

Index: gcc/testsuite/gcc.dg/vect/slp-13.c
===
--- gcc/testsuite/gcc.dg/vect/slp-13.c  (revision 248948)
+++ gcc/testsuite/gcc.dg/vect/slp-13.c  (working copy)
@@ -125,5 +125,7 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
vect_interleave && vect_extract_even_odd } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } 
} */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_pack_trunc } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc } } } */
Index: gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
===
--- gcc/testsuite/gcc.dg/vect/slp-13-big-array.c(revision 248948)
+++ gcc/testsuite/gcc.dg/vect/slp-13-big-array.c(working copy)
@@ -133,5 +133,7 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
vect_interleave && vect_extract_even_odd } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } 
} */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_pack_trunc } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc } } } */


Re: [Patch, ARM, testsuite] Add -mfloat-abi=hard to arm_neon_ok

2017-06-07 Thread Christophe Lyon
Hi,


On 2 June 2017 at 16:19, Christophe Lyon  wrote:
> Hi,
>
> I have recently updated the dejagnu version I use for
> cross-testing arm and aarch64 toolchains to 1.6+. One of the side
> effects was mentioned by Jonathan in
> https://gcc.gnu.org/ml/gcc-patches/2017-05/msg01267.html. Since I
> use multilibs to test many configurations, I noticed several
> changes in the results I get.
>
> In particular, on arm-none-linux-gnueabihf with -march=armv5t,
> all the tests that require arm_neon_ok fail to compile because
> they now use -march=armv5t -mfpu=neon -mfloat-abi=softfp
> -march=armv7-a, which leads to a failure to include
> gnu/stubs-soft.h (not present since the target is
> 'hf'). Previously, -march=armv5t was appended, making the tests
> unsupported because -mfpu=neon conflicts with -march=armv5t. Now,
> arm_neon_ok succeeds because it only checks if some preprocessor
> defines are present.
>
> This patch fixes that by including arm_neon.h in arm_neon_ok, such
> that it fails for unsupported cases. However, since most of these
> tests should pass instead of becoming unsupported, I have added flag
> combinations with -mfloat-abi=hard.
>
> However, this is not sufficient to make the
> gcc.target/arm/lto/pr65837* tests pass: they do not require
> arm_neon_ok, and when I tried to add it, they still failed
> because these lto tests do not support dg-add-options. My
> proposal is to add a new
> check_effective_target_arm_neon_ok_no_float_abi function which
> tries to use neon without trying to change the -mfloat-abi
> setting (that is, the same as arm_neon_ok, with only ""
> and "-mfpu=neon" in the list of flags) . This makes these two lto
> tests unsupported for non-hf targets (again because
> gnu/stubs-soft.h is not present).
>
> To make them pass on "hf" targets:
> - I added -mfpu=neon to dg-lto-options in pr65837-attr_0.c,
>   because the fpu attributes in arm_neon.h only work if
>   -mfpu=neon is enabled
> - I removed dg-suppress-ld-options {-mfpu=neon} from pr65837_0.c,
>   -mfpu=neon is needed for the test to compile with toolchains
>   where the default fpu is not neon (eg vfpv3-d16-fp16)
>
> On arm-none-linux-gnueabihf --with-cpu=cortex-a9 --with-fpu=vfp
> and multilib test flag=-march=armv5t, this patch brings:
> - 2 UNRESOLVED -> FAIL (gcc.dg/vect/vect-align-1.c)
> - 14 UNRESOLVED -> XPASS (in gcc.dg/vect/)
> - 2765 new PASS
> - 3639 FAIL -> PASS
> - 1826 UNRESOLVED -> PASS
> - 102 UNRESOLVED -> XFAIL
>
> as visible in the red cell at
> http://people.linaro.org/~christophe.lyon/cross-validation/gcc-test-patches/248552-gnu-stubs9.patch/report-build-info.html
> (the build-failed line can be ignored, it was caused by a server
> problem)
>
> Sorry, the explanation is almost longer than the patch :-)
>
> Is it OK for trunk?
> (Just realizing that I forgot to document the new functions :( )
>

Here is an updated version with a bit of documentation for the new
effective target.  arm_neon_ok_no_float_abi now only tries to add
-mfpu=neon, not "" as well, since we always add -mfpu=neon in the lto
tests anyway.
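
For reference, the strengthened arm_neon_ok probe boils down to compiling
something like the following under each candidate flag combination and
rejecting combinations where arm_neon.h cannot be included (e.g. when
gnu/stubs-soft.h is absent on an 'hf' multilib).  This is just a sketch of
the idea; the real check lives in lib/target-supports.exp:

#include <arm_neon.h>

int
main (void)
{
  return 0;
}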

OK?


> Thanks,
>
> Christophe
2017-06-02  Christophe Lyon  

gcc/testsuite/
* lib/target-supports.exp
(check_effective_target_arm_neon_ok_nocache): Add flags with
-mfloat-abi=hard. Include arm_neon.h.
(check_effective_target_arm_neon_ok_no_float_abi_nocache): New.
(check_effective_target_arm_neon_ok_no_float_abi): New.
* gcc.target/arm/lto/pr65837_0.c: Require
arm_neon_ok_no_float_abi. Add -mfpu=neon to dg-lto-options.
* gcc.target/arm/lto/pr65837-attr_0.c: Require
arm_neon_ok_no_float_abi. Remove dg-suppress-ld-options.

gcc/
* doc/sourcebuild.texi (ARM-specific attributes): Document new
arm_neon_ok_no_float_abi effective target.
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index bb5b6b9..17d5627 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1582,6 +1582,12 @@ Test system supports executing NEON v2 instructions.
 ARM Target supports @code{-mfpu=neon -mfloat-abi=softfp} or compatible
 options.  Some multilibs may be incompatible with these options.
 
+@item arm_neon_ok_no_float_abi
+@anchor{arm_neon_ok_no_float_abi}
+ARM Target supports NEON with @code{-mfpu=neon}, but without any
+-mfloat-abi= option.  Some multilibs may be incompatible with this
+option.
+
 @item arm_neonv2_ok
 @anchor{arm_neonv2_ok}
 ARM Target supports @code{-mfpu=neon-vfpv4 -mfloat-abi=softfp} or compatible
diff --git a/gcc/testsuite/gcc.target/arm/lto/pr65837-attr_0.c 
b/gcc/testsuite/gcc.target/arm/lto/pr65837-attr_0.c
index ebc5f44..f00480b 100644
--- a/gcc/testsuite/gcc.target/arm/lto/pr65837-attr_0.c
+++ b/gcc/testsuite/gcc.target/arm/lto/pr65837-attr_0.c
@@ -1,6 +1,7 @@
 /* { dg-lto-do run } */
 /* { dg-require-effective-target arm_neon_hw } */
-/* { dg-lto-options {{-flto}} } */
+/* { dg-require-effective-target 

[PATCH] Fix PR80928 fallout

2017-06-07 Thread Richard Biener

The trivial one at least.  Committed.

Richard.

2017-06-07  Richard Biener  

PR tree-optimization/80928
* gcc.dg/vect/slp-perm-8.c: Avoid vectorizing loop computing
check_results.

Index: gcc/testsuite/gcc.dg/vect/slp-perm-8.c
===
--- gcc/testsuite/gcc.dg/vect/slp-perm-8.c  (revision 248919)
+++ gcc/testsuite/gcc.dg/vect/slp-perm-8.c  (working copy)
@@ -41,6 +41,7 @@ int main (int argc, const char* argv[])
   check_results[3*i] = 9 * i + 6;
   check_results[3*i+1] = 9 * i + 15;
   check_results[3*i+2] = 9 * i + 4;
+  __asm__ volatile ("" : : : "memory");
 }
 
   foo (input, output);
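
The added statement is the usual optimization-barrier idiom: an empty asm
with a "memory" clobber makes the compiler assume arbitrary memory may be
read or written, so the loop computing check_results stays scalar instead
of being vectorized like the code under test.  A sketch of the pattern
(illustrative, not the testcase itself):

/* The barrier keeps this reference-result loop un-vectorized.  */
void
fill_expected (int *expected, int n)
{
  for (int i = 0; i < n; i++)
    {
      expected[i] = 3 * i + 7;               /* illustrative formula */
      __asm__ volatile ("" : : : "memory");  /* optimization barrier */
    }
}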


Re: [PATCH] gcov: Add block_info::block_info (PR gcov-profile/80911).

2017-06-07 Thread Tom de Vries

On 05/30/2017 11:41 AM, Martin Liška wrote:

Hello.

The following patch adds a default constructor that initializes all fields
in block_info.



Which fixes the 77 corrupted gcno regressions that currently show up 
when compiling trunk with g++ 4.4.3.



Patch can bootstrap on ppc64le-redhat-linux and survives regression tests.

Ready to be installed?


Ping.

Thanks,
- Tom


Martin


0001-gcov-Add-block_info-block_info-PR-gcov-profile-80911.patch


 From b4f7a624b25c7cf3ed3ccc6d59fb68d1dabb2873 Mon Sep 17 00:00:00 2001
From: marxin 
Date: Mon, 29 May 2017 14:00:09 +0200
Subject: [PATCH] gcov: Add block_info::block_info (PR gcov-profile/80911).

gcc/ChangeLog:

2017-05-29  Martin Liska  

PR gcov-profile/80911
* gcov.c (block_info::block_info): New constructor.
---
  gcc/gcov.c | 11 +++
  1 file changed, 11 insertions(+)

diff --git a/gcc/gcov.c b/gcc/gcov.c
index a5aa4aadcac..e324cadad82 100644
--- a/gcc/gcov.c
+++ b/gcc/gcov.c
@@ -132,6 +132,9 @@ struct block_location_info
  
  typedef struct block_info

  {
+  /* Constructor.  */
+  block_info ();
+
/* Chain of exit and entry arcs.  */
arc_t *succ;
arc_t *pred;
@@ -173,6 +176,14 @@ typedef struct block_info
  
  } block_t;
  
+block_info::block_info (): succ (NULL), pred (NULL), num_succ (0), num_pred (0),

+  id (0), count (0), count_valid (0), valid_chain (0), invalid_chain (0),
+  exceptional (0), is_call_site (0), is_call_return (0), is_nonlocal_return 
(0),
+  locations (), chain (NULL)
+{
+  cycle.arc = NULL;
+}
+
  /* Describes a single function. Contains an array of basic blocks.  */
  
  typedef struct function_info






[C++ PATCH] Fix genericization ICE (PR c++/80984)

2017-06-07 Thread Jakub Jelinek
Hi!

As the testcase shows, BLOCK_VARS of the outermost scope can contain
decls other than VAR_DECL that have DECL_NAME identical to what we are
looking for, in this case a LABEL_DECL.  The code is looking for the
VAR_DECL that has been NRV optimized, and while e.g. DECL_HAS_VALUE_EXPR_P
is allowed on all of VAR_DECL, PARM_DECL and RESULT_DECL, BLOCK_VARS
should not contain the latter two.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and
release branches?

2017-06-07  Jakub Jelinek  

PR c++/80984
* cp-gimplify.c (cp_genericize): Only look for VAR_DECLs in
BLOCK_VARS (outer) chain.
(cxx_omp_const_qual_no_mutable): Likewise.

* g++.dg/opt/nrv18.C: New test.

--- gcc/cp/cp-gimplify.c.jj 2017-05-22 20:49:04.0 +0200
+++ gcc/cp/cp-gimplify.c2017-06-06 10:11:22.780850949 +0200
@@ -1590,7 +1590,8 @@ cp_genericize (tree fndecl)
 
  if (outer)
for (var = BLOCK_VARS (outer); var; var = DECL_CHAIN (var))
- if (DECL_NAME (t) == DECL_NAME (var)
+ if (VAR_P (var)
+ && DECL_NAME (t) == DECL_NAME (var)
  && DECL_HAS_VALUE_EXPR_P (var)
  && DECL_VALUE_EXPR (var) == t)
{
@@ -1837,7 +1838,8 @@ cxx_omp_const_qual_no_mutable (tree decl
 
  if (outer)
for (var = BLOCK_VARS (outer); var; var = DECL_CHAIN (var))
- if (DECL_NAME (decl) == DECL_NAME (var)
+ if (VAR_P (var)
+ && DECL_NAME (decl) == DECL_NAME (var)
  && (TYPE_MAIN_VARIANT (type)
  == TYPE_MAIN_VARIANT (TREE_TYPE (var
{
--- gcc/testsuite/g++.dg/opt/nrv18.C.jj 2017-06-06 10:13:00.925650648 +0200
+++ gcc/testsuite/g++.dg/opt/nrv18.C2017-06-06 10:12:10.670265267 +0200
@@ -0,0 +1,12 @@
+// PR c++/80984
+// { dg-do compile }
+
+struct A { ~A (); };
+
+A
+foo ()
+{
+  A a;
+a:
+  return a;
+}

Jakub


Re: [PATCH GCC][5/5]Enable tree loop distribution at -O3 and above optimization levels.

2017-06-07 Thread Bin.Cheng
On Wed, Jun 7, 2017 at 9:33 AM, Richard Biener
 wrote:
> On Wed, Jun 7, 2017 at 10:07 AM, Bin.Cheng  wrote:
>> On Tue, Jun 6, 2017 at 6:47 PM, Jeff Law  wrote:
>>> On 06/02/2017 05:52 AM, Bin Cheng wrote:
 Hi,
 This patch enables -ftree-loop-distribution by default at -O3 and above 
 optimization levels.
 Bootstrap and test at O2/O3 on x86_64 and AArch64.  is it OK?

 Note I don't have strong opinion here and am fine with either it's 
 accepted or rejected.

 Thanks,
 bin
 2017-05-31  Bin Cheng  

   * opts.c (default_options_table): Enable OPT_ftree_loop_distribution
   for -O3 and above levels.
>>> I think the question is how does this generally impact the performance
>>> of the generated code and to a lesser degree compile-time.
>>>
>>> Do you have any performance data?
>> Hi Jeff,
>> At this stage of the patch, only hmmer is impacted and obviously
>> improved in my local run of spec2006 for x86_64 and AArch64.  In the
>> long term, loop distribution is also a prerequisite transformation for
>> handling bwaves (at least).  For these two impacted cases, it helps to
>> resolve the gap against ICC.  I didn't check the compilation time
>> slowdown; we can restrict the pass to problems with a small partition
>> number if that's an issue.
>
> The source of extra compile-time will be dependence checking which
> is quadratic, there is currently no limit in place on (# writes * (#
> reads + # writes))
> but one could easily be added.
Ah yes, the patch moves dependence computation before partition
construction now.  More likely this is the bottleneck now.

>
> Note that I recently added -fopt-info support for loop distribution so
> it should be
> possible to get an idea how many loops in SPEC are distributed and if small,
> double-check them.
During development, quite a lot of loops get distributed.  I checked some
of them and restricted the pass to not distribute cases where it does no
good.  But I didn't re-check with the final version of the patch.
>
> The cost model at this point is very conservative but due to
> implementation details
> distributing a loop can cause quite some arithmetic to be duplicated like for
>
> int a[1024], b[1024];
>
> void foo()
> {
>   for (int i = 0; i < 1024; ++i)
> {
>a[i] = i * i * i ... * i;
>b[i] = a[i];
> }
> }
>
> it will distribute to two loops both computing i * i * i  rather than
> reading from a[i] in the second loop.
Hmm, this patch no longer distributes this case.  I think it is more
conservative than the original model; for example, the changed ldist
tests are now not distributed because there is no benefit in doing so.

Thanks,
bin
>
> Richard.
>
>> Thanks,
>> bin
>>>
>>> jeff
>>>


Re: [PATCH GCC][1/5]Factor out interface checking if runtime alias check is possible

2017-06-07 Thread Richard Biener
On Fri, Jun 2, 2017 at 1:50 PM, Bin Cheng  wrote:
> Hi,
> This is the first patch in the series improving tree loop distribution.  It
> factors out an interface that checks whether a runtime alias check can be
> used to resolve a data dependence.  This interface is used in both the
> vectorizer and loop distribution to filter out data dependences that are
> inappropriate for runtime alias checking.
>
> Bootstrap and test at O2/O3 on x86_64 and AArch64.  is it OK?

Ok.

Richard.

> Thanks,
> bin
> 2017-05-31  Bin Cheng  
>
> * tree-vect-data-refs.c (vect_mark_for_runtime_alias_test): Factor
> out code checking if runtime alias check is possible to below ...
> Call the new function.
> * tree-data-ref.c (runtime_alias_check_p): ... to new function.
> * tree-data-ref.h (runtime_alias_check_p): New declaration.


Re: [PATCH GCC][2/5]Extend graph data structure

2017-06-07 Thread Richard Biener
On Fri, Jun 2, 2017 at 1:51 PM, Bin Cheng  wrote:
> Hi,
> This patch extends the graph data structure in two ways:
>   1) Passes private data to the callback function of for_each_edge.
>   2) Adds a new callback function to graph traversing functions like
>      graphds_scc and graphds_dfs.  The callback function acts as a
>      supplemental constraint for edges on top of the subgraph constraint.
>      With this change, the traversing functions not only skip
>      vertices/edges not belonging to the subgraph, but also skip edges
>      for which the callback function returns true.  As a result, a pass
>      like loop distribution can traverse the dependence graph with some
>      dependence edges skipped.
>
> Bootstrap and test at O2/O3 on x86_64 and AArch64.  is it OK?

Ok.

Richard.

> Thanks,
> bin
> 2017-05-31  Bin Cheng  
>
> * graphds.c (add_edge): Initialize edge's attached data.
> (foll_in_subgraph, dfs_fst_edge, dfs_next_edge): New function
> pointer parameter.  Call pointed function on each edge during
> graph traversing.  Skip traversing the edge when the function
> returns true.
> (graphds_dfs, graphds_scc): Ditto.
> (for_each_edge): New parameter.  Pass the new parameter to callback
> function.
> * graphds.h (skip_edge_callback): New function pointer type.
> (graphds_dfs, graphds_scc): New function pointer parameter.
> (graphds_edge_callback, for_each_edge): New parameter.


Re: [PATCH GCC][3/5]Move pass ivcanon upward in compilation process

2017-06-07 Thread Richard Biener
On Fri, Jun 2, 2017 at 1:51 PM, Bin Cheng  wrote:
> Hi,
> This patch moves pass ivcanon before loop distribution.  Pass loop split
> can create loops with limited niters.  Such loops should be unrolled
> before loop distribution (or graphite), rather than after.
>
> Bootstrap and test at O2/O3 on x86_64 and AArch64.  is it OK?

Ok.

Richard.

> Thanks,
> bin
> 2017-05-31  Bin Cheng  
>
> * passes.def (pass_iv_canon): Move before pass_loop_distribution.


Re: [PATCH GCC][5/5]Enable tree loop distribution at -O3 and above optimization levels.

2017-06-07 Thread Richard Biener
On Wed, Jun 7, 2017 at 10:07 AM, Bin.Cheng  wrote:
> On Tue, Jun 6, 2017 at 6:47 PM, Jeff Law  wrote:
>> On 06/02/2017 05:52 AM, Bin Cheng wrote:
>>> Hi,
>>> This patch enables -ftree-loop-distribution by default at -O3 and above 
>>> optimization levels.
>>> Bootstrap and test at O2/O3 on x86_64 and AArch64.  is it OK?
>>>
>>> Note I don't have strong opinion here and am fine with either it's accepted 
>>> or rejected.
>>>
>>> Thanks,
>>> bin
>>> 2017-05-31  Bin Cheng  
>>>
>>>   * opts.c (default_options_table): Enable OPT_ftree_loop_distribution
>>>   for -O3 and above levels.
>> I think the question is how does this generally impact the performance
>> of the generated code and to a lesser degree compile-time.
>>
>> Do you have any performance data?
> Hi Jeff,
> At this stage of the patch, only hmmer is impacted and obviously
> improved in my local run of spec2006 for x86_64 and AArch64.  In the
> long term, loop distribution is also a prerequisite transformation for
> handling bwaves (at least).  For these two impacted cases, it helps to
> resolve the gap against ICC.  I didn't check the compilation time
> slowdown; we can restrict the pass to problems with a small partition
> number if that's an issue.

The source of extra compile-time will be dependence checking which
is quadratic, there is currently no limit in place on (# writes * (#
reads + # writes))
but one could easily be added.
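
Such a guard could be as simple as the following helper (illustrative
names; no such function or --param exists today):

/* Return nonzero if the quadratic worst case - every write tested
   against every read and every other write - stays below LIMIT.  */
static int
dependence_count_ok_p (unsigned n_reads, unsigned n_writes, unsigned limit)
{
  return n_writes * (n_reads + n_writes) <= limit;
}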

Note that I recently added -fopt-info support for loop distribution so
it should be
possible to get an idea how many loops in SPEC are distributed and if small,
double-check them.

The cost model at this point is very conservative but due to
implementation details
distributing a loop can cause quite some arithmetic to be duplicated like for

int a[1024], b[1024];

void foo()
{
  for (int i = 0; i < 1024; ++i)
{
   a[i] = i * i * i ... * i;
   b[i] = a[i];
}
}

it will distribute to two loops both computing i * i * i  rather than
reading from a[i] in the second loop.
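
Schematically, with a shortened product chain, the distributed form is
(illustrative only; the actual split depends on the cost model):

void
foo_distributed (void)
{
  for (int i = 0; i < 1024; ++i)
    a[i] = i * i * i;
  for (int i = 0; i < 1024; ++i)
    b[i] = i * i * i;   /* duplicated arithmetic, not b[i] = a[i] */
}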

Richard.

> Thanks,
> bin
>>
>> jeff
>>


Re: [PATCH] handle bzero/bcopy in DSE and aliasing (PR 80933, 80934)

2017-06-07 Thread Richard Biener
On Wed, Jun 7, 2017 at 5:26 AM, Martin Sebor  wrote:
>> Note I'd be _much_ more sympathetic to simply canonicalizing all of
>> bzero and bcopy
>> to memset / memmove and be done with all the above complexity.
>
>
> Attached is an updated patch along these lines.  Please let me
> know if it matches your expectations.

I think you attached the wrong patch.

Richard.

> FWIW, although I don't feel too strongly about bzero et al. I'm
> not sure that this approach is the right one in general.  It might
> (slightly) simplify GCC itself, but other than the incidental code
> generation improvement, it offers no benefit to users.  In some
> cases, it even degrades user experience by causing GCC issue
> diagnostics that refer to functions that don't appear in the source
> code, such as for:
>
>   char d[1];
>
>   void* f (const void *p)
>   {
> bzero (d, 7);
>   }
>
>   warning: ‘__builtin_memset’ writing 7 bytes into a region of size 1
> overflows the destination [-Wstringop-overflow=]
>
> For some functions like mempcpy it might even produce worse code overall
> (slower and bigger).
>
> In other cases (like profiling) it loses interesting information.
>
> I think these types of transformations would be justified if they
> were done based on measurably improved efficiency of the generated
> code, but I'm uneasy about swapping calls to one function for another
> solely because it simplifies the implementation.  Not least because
> it doesn't seem like a viable general approach to simplifying the
> implementation.
>
> Martin
>
> PS I stopped short of simplifying GCC to remove the existing special
> handling of these three built-ins.  If the patch is approved I'm
> willing to do the cleanup in a subsequent pass.
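
For reference, the canonicalization under discussion amounts to the
following equivalences (a sketch of the mapping, not the patch itself;
note bcopy's swapped argument order):

#include <string.h>

/* bzero (d, n) == memset (d, 0, n);
   bcopy (s, d, n) == memmove (d, s, n).  */
static void
sketch_bzero (void *d, size_t n)
{
  memset (d, 0, n);
}

static void
sketch_bcopy (const void *s, void *d, size_t n)
{
  memmove (d, s, n);
}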


Re: [PATCH][2/2] early LTO debug, main part

2017-06-07 Thread Richard Biener
On Fri, 19 May 2017, Richard Biener wrote:

> 
> This is a repost of the main part of the early LTO debug support.
> The only changes relative to the last post is in the dwarf2out.c
> pieces due to Jasons review and Jakubs introduction of
> DW_OP_GNU_variable_value.
> 
> I've also adjusted testcases for fallout (the asan backtraces do not
> give files / line numbers because libbacktrace doesn't understand
> the DWARF) plus added a -flto run over the libstdc++ pretty printer
> testsuite -- after all the goal was to make those work with LTO,
> and they now nicely do.
> 
> [LTO-]bootstrapped and tested on x86_64-unknown-linux-gnu.
> 
> I've also tested with -flto -g and compared to before the patch and
> the outcome doesn't contain any surprises.
> 
> I've also ran the gdb testsuite with no differences (but I guess
> it doesn't exercise LTO).
> 
> I've also built SPEC 2k6 with -flto -g.
> 
> I've also debugged optimized LTO bootstrapped cc1 a bit - not that
> debugging (LTO) optimized cc1 is a pleasant experience, but at least
> gdb doesn't crash.
> 
> Ok for trunk?

Ping.

> Both darwin and mingw maintainers were not concerned about LTO with -g
> being broken for them.
> 
> This patch allows us to go forward with freeing more stuff after
> the frontend has finished, in particular removing LTO streaming of a
> lot of type information that is referenced from trees (and, as a first
> step, enabling free-lang-data for non-LTO compiles).
> 
> Thanks,
> Richard.
> 
> 2017-05-19  Richard Biener  
> 
> * debug.h (struct gcc_debug_hooks): Add die_ref_for_decl and
> register_external_die hooks.
> (debug_false_tree_charstarstar_uhwistar): Declare.
> (debug_nothing_tree_charstar_uhwi): Likewise.
> * debug.c (do_nothing_debug_hooks): Adjust.
> (debug_false_tree_charstarstar_uhwistar): New do nothing.
> (debug_nothing_tree_charstar_uhwi): Likewise.
> * dbxout.c (dbx_debug_hooks): Adjust.
> (xcoff_debug_hooks): Likewise.
> * sdbout.c (sdb_debug_hooks): Likewise.
> * vmsdbgout.c (vmsdbg_debug_hooks): Likewise.
> 
> * dwarf2out.c (macinfo_label_base): New global.
>   (dwarf2out_register_external_die): New function for the
>   register_external_die hook.
> (dwarf2out_die_ref_for_decl): Likewise for die_ref_for_decl.
> (dwarf2_debug_hooks): Use them.
> (dwarf2_lineno_debug_hooks): Adjust.
> (struct die_struct): Add with_offset flag.
> (DEBUG_LTO_DWO_INFO_SECTION, DEBUG_LTO_INFO_SECTION,
> DEBUG_LTO_DWO_ABBREV_SECTION, DEBUG_LTO_ABBREV_SECTION,
> DEBUG_LTO_DWO_MACINFO_SECTION, DEBUG_LTO_MACINFO_SECTION,
> DEBUG_LTO_DWO_MACRO_SECTION, DEBUG_LTO_MACRO_SECTION,
> DEBUG_LTO_LINE_SECTION, DEBUG_LTO_DWO_STR_OFFSETS_SECTION,
> DEBUG_LTO_STR_DWO_SECTION, DEBUG_STR_LTO_SECTION): New macros
> defining section names for the early LTO debug variants.
>   (reset_indirect_string): New helper.
> (add_AT_external_die_ref): Helper for dwarf2out_register_external_die.
> (print_dw_val): Add support for offsetted symbol references.
> (compute_section_prefix_1): Split out worker to distinguish
> the comdat from the LTO case.
> (compute_section_prefix): Wrap old comdat case here.
> (output_die): Skip DIE symbol output for the LTO added one.
> Handle DIE symbol references with offset.
> (output_comp_unit): Guard section name mangling properly.
> For LTO debug sections emit a symbol at the section beginning
> which we use to refer to its DIEs.
> (add_abstract_origin_attribute): For DIEs registered via
> dwarf2out_register_external_die directly refer to the early
> DIE rather than indirectly through the shadow one we created.
> (gen_array_type_die): When generating early LTO debug do
> not emit DW_AT_string_length.
> (gen_formal_parameter_die): Do not re-create DIEs for PARM_DECLs
> late when in LTO.
> (gen_subprogram_die): Adjust the check for whether we face
> a concrete instance DIE for an inline we can reuse for the
> late LTO case.  Likewise avoid another specification DIE
> for early built declarations/definitions for the late LTO case.
> (gen_variable_die): Add type references for late duplicated VLA dies
> when in late LTO.
> (gen_inlined_subroutine_die): Do not call dwarf2out_abstract_function,
> we have the abstract instance already.
> (process_scope_var): Adjust decl DIE contexts in LTO which
> first puts them in limbo.
> (gen_decl_die): Do not generate type DIEs late apart from
> types for VLAs or for decls we do not yet have a DIE.
> (dwarf2out_early_global_decl): Make sure to create DIEs
> for abstract instances of a decl first.
> (dwarf2out_late_global_decl): Adjust comment.
> 

Re: [PATCH][1/2] Early LTO debug, simple-object part

2017-06-07 Thread Richard Biener
On Fri, 19 May 2017, Richard Biener wrote:

> 
> This is a repost (unchanged) of the simple-object ELF support for
> early LTO debug transfer from IL object to a separate debug-only object 
> file.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu.

Ping.

> Richard.
> 
> 2017-05-19  Richard Biener  
> 
>   include/
>   * simple-object.h (simple_object_copy_lto_debug_sections): New
>   function.
> 
>   libiberty/
>   * simple-object-common.h (struct simple_object_functions): Add
>   copy_lto_debug_sections hook.
>   * simple-object.c: Include fcntl.h.
>   (handle_lto_debug_sections): New helper function.
>   (simple_object_copy_lto_debug_sections): New function copying
>   early LTO debug sections to regular debug sections in a new file.
>   (simple_object_start_write): Handle NULL segment_name.
>   * simple-object-coff.c (simple_object_coff_functions): Adjust
>   for not implemented copy_lto_debug_sections hook.
>   * simple-object-mach-o.c (simple_object_mach_o_functions): Likewise.
>   * simple-object-xcoff.c (simple_object_xcoff_functions): Likewise.
>   * simple-object-elf.c (SHT_NULL, SHT_SYMTAB, SHT_RELA, SHT_REL,
>   SHT_GROUP): Add various section header types.
>   (SHF_EXCLUDE): Add flag.
>   (Elf32_External_Sym, Elf64_External_Sym): Add symbol struct.
>   (ELF_ST_BIND, ELF_ST_TYPE, ELF_ST_INFO): Add accessors.
>   (STT_OBJECT, STT_FUNC, STT_TLS, STT_GNU_IFUNC): Add Symbol types.
>   (STV_DEFAULT): Add symbol visibility.
>   (SHN_COMMON): Add special section index name.
>   (struct simple_object_elf_write): New.
>   (simple_object_elf_start_write): Adjust for new private data.
>   (simple_object_elf_write_shdr): Pass in values for all fields
>   we write.
>   (simple_object_elf_write_to_file): Adjust.  Copy from recorded
>   section headers if requested.
>   (simple_object_elf_release_write): Release private data.
>   (simple_object_elf_copy_lto_debug_sections): Copy and rename sections
>   as denoted by PFN and all their dependences, symbols and relocations
>   to the empty destination file.
>   (simple_object_elf_functions): Adjust for copy_lto_debug_sections hook.
> 
> Index: early-lto-debug/include/simple-object.h
> ===
> --- early-lto-debug.orig/include/simple-object.h  2016-10-19 
> 13:19:58.012326431 +0200
> +++ early-lto-debug/include/simple-object.h   2016-10-20 10:51:49.861722998 
> +0200
> @@ -197,6 +197,14 @@ simple_object_write_to_file (simple_obje
>  extern void
>  simple_object_release_write (simple_object_write *);
>  
> +/* Copy LTO debug sections from SRC_OBJECT to DEST.
> +   If an error occurs, return the errno value in ERR and an error string.  */
> +
> +extern const char *
> +simple_object_copy_lto_debug_sections (simple_object_read *src_object,
> +const char *dest,
> +int *err);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> Index: early-lto-debug/libiberty/simple-object-common.h
> ===
> --- early-lto-debug.orig/libiberty/simple-object-common.h 2016-10-19 
> 13:19:58.012326431 +0200
> +++ early-lto-debug/libiberty/simple-object-common.h  2016-10-20 
> 10:51:49.865723045 +0200
> @@ -141,6 +141,12 @@ struct simple_object_functions
>  
>/* Release the private data for an simple_object_write.  */
>void (*release_write) (void *);
> +
> +  /* Copy LTO debug sections.  */
> +  const char *(*copy_lto_debug_sections) (simple_object_read *sobj,
> +   simple_object_write *dobj,
> +   int (*pfn) (const char **),
> +   int *err);
>  };
>  
>  /* The known object file formats.  */
> Index: early-lto-debug/libiberty/simple-object-elf.c
> ===
> --- early-lto-debug.orig/libiberty/simple-object-elf.c2016-10-19 
> 13:19:58.012326431 +0200
> +++ early-lto-debug/libiberty/simple-object-elf.c 2016-10-20 
> 10:51:49.865723045 +0200
> @@ -183,8 +183,55 @@ typedef struct {
>  
>  /* Values for sh_type field.  */
>  
> +#define SHT_NULL 0   /* Section header table entry unused */
>  #define SHT_PROGBITS 1   /* Program data */
> +#define SHT_SYMTAB   2   /* Link editing symbol table */
>  #define SHT_STRTAB   3   /* A string table */
> +#define SHT_RELA 4   /* Relocation entries with addends */
> +#define SHT_REL  9   /* Relocation entries, no 
> addends */
> +#define SHT_GROUP 17 /* Section contains a section group */
> +
> +/* Values for sh_flags field.  */
> +
> +#define SHF_EXCLUDE  0x80000000  /* Link editor is to exclude this
> +

Re: [PATCH GCC][5/5]Enable tree loop distribution at -O3 and above optimization levels.

2017-06-07 Thread Bin.Cheng
On Tue, Jun 6, 2017 at 6:47 PM, Jeff Law  wrote:
> On 06/02/2017 05:52 AM, Bin Cheng wrote:
>> Hi,
>> This patch enables -ftree-loop-distribution by default at -O3 and above 
>> optimization levels.
>> Bootstrap and test at O2/O3 on x86_64 and AArch64.  is it OK?
>>
>> Note I don't have strong opinion here and am fine with either it's accepted 
>> or rejected.
>>
>> Thanks,
>> bin
>> 2017-05-31  Bin Cheng  
>>
>>   * opts.c (default_options_table): Enable OPT_ftree_loop_distribution
>>   for -O3 and above levels.
> I think the question is how does this generally impact the performance
> of the generated code and to a lesser degree compile-time.
>
> Do you have any performance data?
Hi Jeff,
At this stage of the patch, only hmmer is impacted and obviously
improved in my local run of spec2006 for x86_64 and AArch64.  In the
long term, loop distribution is also a prerequisite transformation for
handling bwaves (at least).  For these two impacted cases, it helps to
resolve the gap against ICC.  I didn't check the compilation time
slowdown; we can restrict the pass to problems with a small partition
number if that's an issue.

Thanks,
bin
>
> jeff
>


[PATCH] MIPS16/GCC: Emit bounds checking as RTL in `casesi'

2017-06-07 Thread Maciej W. Rozycki
Hi,

 Further to my changes made last November, here is an update to the MIPS16 
`casesi' pattern making it emit bounds checking as RTL rather than having 
it as hardcoded assembly within the `casesi_internal_mips16_<mode>' 
dispatcher.  See below for how PR tree-optimization/51513 had prevented me 
from proceeding back then.

 This new arrangement has several advantages:

1. There is no hardcoded BTEQZ branch instruction that has a limited span
   and can overflow causing an assembly failure if the target label is too
   distant.  GCC is able to relax out of range MIPS16 branches these days, 
   but obviously they have to be expressed in RTL rather than buried in 
   assembly code.  This overflow can be easily reproduced; please enquire 
   for a boring example if interested.

2. The `casesi_internal_mips16_<mode>' pattern now has an accurate length
   (aka instruction count) recorded as all its remaining emitted assembly 
   instructions are known in advance to fit in their regular (16-bit) 
   machine encodings.  Previously there was uncertainty about the SLTU and
   BTEQZ instructions used for the bounds check, which depending on their 
   exact arguments could have each resulted in their either regular 
   (16-bit) or extended (32-bit) encoding.  Consequently the worst length 
   estimate was recorded instead, possibly causing worse code generation
   (e.g. premature out-of-range branch relaxation or jump table expansion 
   from half-words to full words).

3. GCC now has freedom to schedule code around bounds checking as it sees 
   fit rather than having to adapt to the fixed assembly block.  In fact 
   it tends to make use of it right away swapping BTEQZ for the BTNEZ
   instruction and rescheduling code such that the out-of-bounds (default) 
   case executes linearly.

There are probably more benefits, but these are the ones that immediately 
come to my mind.

 As noted above, I meant to propose this change along with the rest so as 
to have it in GCC 7; however, emitting the bounds check as a separate RTL 
pattern triggered an unrelated bug, then unknown to me, causing a fatal 
code generation regression, and lack of time did not allow me to 
investigate it further.  This was easily reproduced with a piece of code 
(reduced from actual Linux kernel code) like this:

$ cat frob.c
int
frob (int i)
{
  switch (i)
{
case -5:
  return -2;
case -3:
  return -1;
case 0:
  return 0;
case 3:
  return 1;
case 5:
  break;
default:
  __builtin_unreachable ();
}
  return i;
}

producing truncated assembly like this:

$ gcc -O2 -mips16 -mcode-readable=yes -dp -S frob.c
$ cat frob.s
.file   1 "frob.c"
.section .mdebug.abi32
.previous
.nan    legacy
.module fp=32
.module nooddspreg
.abicalls
.option pic0
.text
.align  2
.weak   frob
.set    mips16
.set    nomicromips
.ent    frob
.type   frob, @function
frob:
.frame  $sp,0,$31   # vars= 0, regs= 0/0, args= 0, gp= 0
.mask   0x00000000,0
.fmask  0x00000000,0
addiu   $2,$4,5  # 11   *addsi3_mips16/7[length = 2]
.end    frob
.size   frob, .-frob
.ident  "GCC: (GNU) 7.0.0 20161117 (experimental)"
$ 

-- where as you can see the whole switch statement has vanished along with 
any return path from the function, and only the preparatory addition 
emitted from `casesi' with:

  emit_insn (gen_addsi3 (reg, operands[0], offset));

remained.
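
 For reference, the bounds check that `casesi' now emits as RTL amounts, 
in C terms, to a single unsigned comparison covering both ends of the case 
range; for frob above lo = -5 and hi = 5, hence the ADDIU of 5 that 
survived.  An illustrative sketch (assuming no overflow in i - lo):

static int
in_case_range_p (int i, int lo, int hi)
{
  /* One unsigned comparison covers both i < lo and i > hi; the
     subtraction corresponds to the ADDIU above and the comparison to
     the SLTU/branch pair formerly hardcoded in the dispatcher.  */
  return (unsigned) (i - lo) <= (unsigned) (hi - lo);
}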

 The causing bug has turned out to be what was filed as PR 
tree-optimization/51513 and has been kindly fixed by Peter recently 
(thanks, Peter!) with r247844 ("Fix PR51513, switch statement with default 
case containing __builtin_unreachable leads to wild branch"), enabling me to 
proceed with this change without having to investigate the cause of code 
breakage -- which for the MIPS16 target has clearly turned out to be 
graver than a mere silly branch never to be executed.

 Given the previous troubles with this change I have decided to add
MIPS16 test cases to verify that code truncation has not happened, 
complementing gcc.target/powerpc/pr51513.c, in case further tweaks in this 
area might do something bad.  This would be caught by 
gcc.target/mips/insn-casesi.c added with r242424, but that test case does 
not refer to PR tree-optimization/51513, so let's make it explicit.  With 
the PR tree-optimization/51513 fix removed the two new cases indeed cause:

FAIL: gcc.target/mips/pr51513-1.c   -O2   scan-assembler \tjrc?\t\\$31\n
FAIL: gcc.target/mips/pr51513-1.c   -O3 -g   scan-assembler \tjrc?\t\\$31\n
FAIL: gcc.target/mips/pr51513-1.c   -O2 -flto -fno-use-linker-plugin 
-flto-partition=none   scan-assembler \tjrc?\t\\$31\n
FAIL: gcc.target/mips/pr51513-1.c   -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects   scan-assembler 

Fix profile updating in cfgbuild

2017-06-07 Thread Jan Hubicka
Hi
the following patch makes cfgbuild preserve the profile when loops are
introduced at the RTL level (not very well, but at least it does not throw
it all away), and also avoids re-computing probabilities when there are no
changes to the CFG.

Bootstrapped/regtested x86_64-linux. Committed.

Honza

Index: cfgbuild.c
===
--- cfgbuild.c  (revision 248915)
+++ cfgbuild.c  (working copy)
@@ -475,6 +475,10 @@ find_bb_boundaries (basic_block bb)
 
  bb = fallthru->dest;
  remove_edge (fallthru);
+ /* BB is unreachable at this point - we need to determine its profile
+once edges are built.  */
+ bb->frequency = 0;
+ bb->count = profile_count::uninitialized ();
  flow_transfer_insn = NULL;
  if (code == CODE_LABEL && LABEL_ALT_ENTRY_P (insn))
make_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun), bb, 0);
@@ -577,7 +581,7 @@ compute_outgoing_frequencies (basic_bloc
 guess_outgoing_edge_probabilities (b);
 }
 
-  if (b->count > profile_count::zero ())
+  if (b->count.initialized_p ())
 FOR_EACH_EDGE (e, ei, b->succs)
   e->count = b->count.apply_probability (e->probability);
 }
@@ -590,6 +594,9 @@ void
 find_many_sub_basic_blocks (sbitmap blocks)
 {
   basic_block bb, min, max;
+  bool found = false;
+  auto_vec<unsigned int> n_succs;
+  n_succs.safe_grow_cleared (last_basic_block_for_fn (cfun));
 
   FOR_EACH_BB_FN (bb, cfun)
 SET_STATE (bb,
@@ -597,11 +604,24 @@ find_many_sub_basic_blocks (sbitmap bloc
 
   FOR_EACH_BB_FN (bb, cfun)
 if (STATE (bb) == BLOCK_TO_SPLIT)
-  find_bb_boundaries (bb);
+  {
+   int n = last_basic_block_for_fn (cfun);
+   unsigned int ns = EDGE_COUNT (bb->succs);
+
+   find_bb_boundaries (bb);
+   if (n == last_basic_block_for_fn (cfun) && ns == EDGE_COUNT (bb->succs))
+ n_succs[bb->index] = EDGE_COUNT (bb->succs);
+  }
 
   FOR_EACH_BB_FN (bb, cfun)
 if (STATE (bb) != BLOCK_ORIGINAL)
-  break;
+  {
+   found = true;
+   break;
+  }
+
+  if (!found)
+return;
 
   min = max = bb;
   for (; bb != EXIT_BLOCK_PTR_FOR_FN (cfun); bb = bb->next_bb)
@@ -624,14 +644,37 @@ find_many_sub_basic_blocks (sbitmap bloc
  continue;
if (STATE (bb) == BLOCK_NEW)
  {
+   bool initialized_src = false, uninitialized_src = false;
bb->count = profile_count::zero ();
bb->frequency = 0;
FOR_EACH_EDGE (e, ei, bb->preds)
  {
-   bb->count += e->count;
+   if (e->count.initialized_p ())
+ {
+   bb->count += e->count;
+   initialized_src = true;
+ }
+   else
+ uninitialized_src = true;
bb->frequency += EDGE_FREQUENCY (e);
  }
+   /* When some edges are missing with read profile, this is
+  most likely because RTL expansion introduced loop.
+  When profile is guessed we may have BB that is reachable
+  from unlikely path as well as from normal path.
+
+  TODO: We should handle loops created during BB expansion
+  correctly here.  For now we assume all those loop to cycle
+  precisely once.  */
+   if (!initialized_src
+   || (uninitialized_src
+&& profile_status_for_fn (cfun) != PROFILE_READ))
+ bb->count = profile_count::uninitialized ();
  }
+   else
+ /* If nothing changed, there is no need to create new BBs.  */
+ if (EDGE_COUNT (bb->succs) == n_succs[bb->index])
+   continue;
 
compute_outgoing_frequencies (bb);
   }