Re: [PATCH] Builtins handling in IVOPT

2013-11-21 Thread Zdenek Dvorak
Hi,

 This patch works on the intrinsic calls handling issue in IVOPT mentioned 
 here:
 http://gcc.gnu.org/ml/gcc-patches/2010-10/msg01295.html
 
 In find_interesting_uses_stmt, it changes
 
 arg = expr
 __builtin_xxx (arg)
 
 to
 
 arg = expr;
 tmp = addr_expr (mem_ref(arg));
 __builtin_xxx (tmp, ...)

this looks a bit confusing (and wasteful) to me. It would make more sense to
just record the argument as USE_ADDRESS and do the rewriting in 
rewrite_use_address.

Zdenek


Re: [patch,libgfortran] Fix binary128 ERFC_SCALED

2013-11-21 Thread Andreas Schwab
../../../libgfortran/intrinsics/erfc_scaled.c:59:1: error: unknown type name 
'GFC_REAL_16'
 extern GFC_REAL_16 erfc_scaled_r16 (GFC_REAL_16);
 ^
../../../libgfortran/intrinsics/erfc_scaled.c:59:1: warning: parameter names 
(without types) in function declaration [enabled by default]
../../../libgfortran/intrinsics/erfc_scaled.c:63:1: error: unknown type name 
'GFC_REAL_16'
 GFC_REAL_16
 ^
../../../libgfortran/intrinsics/erfc_scaled.c:64:18: error: unknown type name 
'GFC_REAL_16'
 erfc_scaled_r16 (GFC_REAL_16 x)
  ^
../../../libgfortran/intrinsics/erfc_scaled.c:66:3: error: unsupported 
non-standard suffix on floating constant
   if (x < -106.566990228185312813205074546585730Q)
   ^
make[3]: *** [erfc_scaled.lo] Error 1

Andreas.

-- 
Andreas Schwab, SUSE Labs, sch...@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
And now for something completely different.


Re: [SH] PR 53976 - Add RTL pass to eliminate clrt, sett insns

2013-11-21 Thread Oleg Endo
On Thu, 2013-11-21 at 08:06 +0900, Kaz Kojima wrote:
 Oleg Endo oleg.e...@t-online.de wrote:
  * config.gcc (SH extra_objs): Add  sh_optimize_sett_clrt pass.o.
 
 The usual way would be
 
   * config.gcc (sh[123456789lbe]*-*-* | sh-*-*): Add
   sh_optimize_sett_clrt.o to extra_objs.
 
 OK with that change.  Thanks!

Thanks.  While at it, I also fixed the ChangeLog entry for
sh_treg_combine.o.

Cheers,
Oleg



Re: [patch,libgfortran] Fix binary128 ERFC_SCALED

2013-11-21 Thread FX
 ../../../libgfortran/intrinsics/erfc_scaled.c:59:1: error: unknown type name 
 ‘GFC_REAL_16’

I’m really sorry about that, I should have tested on a system without 
binary128, that would have caught it.
Attached patch committed as rev. 205193 after checking it on system both with 
and without binary128.

Sorry again,
FX



2013-11-20  Francois-Xavier Coudert  fxcoud...@gcc.gnu.org

PR libfortran/59227
* intrinsics/erfc_scaled.c (erfc_scaled_r16): Don't define if
__float128 is not available.



fix.diff
Description: Binary data


Re: [PATCH] Builtins handling in IVOPT

2013-11-21 Thread Bin.Cheng
I don't know very much about the problem but willing to study since I
am looking into IVO recently :)

 --- tree-ssa-loop-ivopts.c  (revision 204792)
 +++ tree-ssa-loop-ivopts.c  (working copy)
 @@ -135,6 +135,8 @@ struct iv
tree ssa_name;   /* The ssa name with the value.  */
bool biv_p;  /* Is it a biv?  */
bool have_use_for;   /* Do we already have a use for it?  */
 +  bool builtin_mem_param; /* Used as param of a builtin, so it could not be
 +removed by remove_unused_ivs.  */

As comment below, address parameter may be not limited to builtin
function only, how about a variable name more generic?

unsigned use_id; /* The identifier in the use if it is the case.  */
  };

 @@ -952,6 +954,7 @@ alloc_iv (tree base, tree step)
iv->step = step;
iv->biv_p = false;
iv->have_use_for = false;
 +  iv->builtin_mem_param = false;
iv->use_id = 0;
iv->ssa_name = NULL_TREE;

 @@ -1874,13 +1877,36 @@ find_invariants_stmt (struct ivopts_data
  }
  }

 +/* Find whether the Ith param of the BUILTIN is a mem
 +   reference. If I is -1, it returns whether the BUILTIN
 +   contains any mem reference type param.  */
 +
 +static bool
 +builtin_has_mem_ref_p (gimple builtin, int i)
 +{
 +  tree fndecl = gimple_call_fndecl (builtin);
 +  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
 +{
 +  switch (DECL_FUNCTION_CODE (fndecl))
 +   {
 +   case BUILT_IN_PREFETCH:
 + if (i == -1 || i == 0)
 +return true;
 +   }
This switch looks strange, could be refactored I think.

 +}
 +  else if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
 +return targetm.builtin_has_mem_ref_p ((int) DECL_FUNCTION_CODE
 (fndecl), i);
 +
 +  return false;
 +}
 +
  /* Finds interesting uses of induction variables in the statement STMT.  */

  static void
  find_interesting_uses_stmt (struct ivopts_data *data, gimple stmt)
  {
struct iv *iv;
 -  tree op, *lhs, *rhs;
 +  tree op, *lhs, *rhs, callee;
ssa_op_iter iter;
use_operand_p use_p;
enum tree_code code;
 @@ -1937,6 +1963,74 @@ find_interesting_uses_stmt (struct ivopt

  call (memory).  */
  }
 +  else if (is_gimple_call (stmt)
 +   && (callee = gimple_call_fndecl (stmt))
 +   && is_builtin_fn (callee)
 +   && builtin_has_mem_ref_p (stmt, -1))
 +{

I noticed the preceding comments about call(memory), is your change a
specific case of the mention one?

 +  size_t i;
 +  for (i = 0; i < gimple_call_num_args (stmt); i++)
 +   {
 + if (builtin_has_mem_ref_p (stmt, i))
 +   {
 + gimple def, g;
 + gimple_seq seq = NULL;
 + tree type, mem, addr, rhs;
 + tree *arg = gimple_call_arg_ptr (stmt, i);
 +  location_t loc = gimple_location (stmt);
 + gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 +
 +  if (TREE_CODE (*arg) != SSA_NAME)
 +   continue;
 +
 + def = SSA_NAME_DEF_STMT (*arg);
 + gcc_assert (gimple_code (def) == GIMPLE_PHI
 + || is_gimple_assign (def));
 + /* Suppose we have the case:
 +  arg = expr;
 +  call (arg)
 +If the expr is not like the form: MEM(...), change it to:
 +  arg = expr;
 +  tmp = MEM(arg);
 +  call(tmp);
 +then try to find interesting uses address in MEM(arg).  */
 + if (is_gimple_assign (def)
 +  && (rhs = gimple_assign_rhs1(def))
 +  && TREE_CODE (rhs) == ADDR_EXPR
 +  && REFERENCE_CLASS_P (TREE_OPERAND (rhs, 0)))
 +   {
 + iv = get_iv (data, *arg);
 + if (iv && !iv->builtin_mem_param)
 +   iv->builtin_mem_param = true;
 +
 + find_interesting_uses_address (data, def,
 +TREE_OPERAND (rhs, 0));
 +   }
 + else
 +   {
 + mem = build2 (MEM_REF, TREE_TYPE (*arg), *arg,
 +   build_int_cst (TREE_TYPE (*arg), 0));
 + type = build_pointer_type (TREE_TYPE (*arg));
 + addr = build1 (ADDR_EXPR, type, mem);
 + g = gimple_build_assign_with_ops (ADDR_EXPR,
 +   make_ssa_name (type, 
 NULL),
 +   addr, NULL);
 + gimple_call_set_arg (stmt, i, gimple_assign_lhs (g));
 + update_stmt (stmt);
 + gimple_set_location (g, loc);
 + gimple_seq_add_stmt_without_update (seq, g);
 + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
 + find_interesting_uses_address (data, g,
 TREE_OPERAND (addr, 0));

This would be the only code changes gimple before iv use 

[PATCH][1/2] Fix PR59058

2013-11-21 Thread Richard Biener

This removes the use of the bogus number_of_exit_cond_executions
function from loop-distribution.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2013-11-21  Richard Biener  rguent...@suse.de

PR tree-optimization/59058
* tree-loop-distribution.c (struct partition_s): Add plus_one
member.
(build_size_arg_loc): Apply niter adjustment here.
(generate_memset_builtin): Adjust.
(generate_memcpy_builtin): Likewise.
(classify_partition): Do not use number_of_exit_cond_executions
but record whether niter needs to be adjusted.

Index: gcc/tree-loop-distribution.c
===
*** gcc/tree-loop-distribution.c(revision 205118)
--- gcc/tree-loop-distribution.c(working copy)
*** typedef struct partition_s
*** 480,485 
--- 480,486 
data_reference_p main_dr;
data_reference_p secondary_dr;
tree niter;
+   bool plus_one;
  } *partition_t;
  
  
*** generate_loops_for_partition (struct loo
*** 703,715 
  /* Build the size argument for a memory operation call.  */
  
  static tree
! build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter)
  {
!   tree size;
!   size = fold_build2_loc (loc, MULT_EXPR, sizetype,
! fold_convert_loc (loc, sizetype, nb_iter),
  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
!   return fold_convert_loc (loc, size_type_node, size);
  }
  
  /* Build an address argument for a memory operation call.  */
--- 704,719 
  /* Build the size argument for a memory operation call.  */
  
  static tree
! build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter,
!   bool plus_one)
  {
!   tree size = fold_convert_loc (loc, sizetype, nb_iter);
!   if (plus_one)
! size = size_binop (PLUS_EXPR, size, size_one_node);
!   size = fold_build2_loc (loc, MULT_EXPR, sizetype, size,
  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
!   size = fold_convert_loc (loc, size_type_node, size);
!   return size;
  }
  
  /* Build an address argument for a memory operation call.  */
*** generate_memset_builtin (struct loop *lo
*** 781,787 
/* The new statements will be placed before LOOP.  */
gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
  
!   nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter);
 nb_bytes = force_gimple_operand_gsi (gsi, nb_bytes, true, NULL_TREE,
    false, GSI_CONTINUE_LINKING);
 mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
--- 785,792 
/* The new statements will be placed before LOOP.  */
gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
  
!   nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
!partition->plus_one);
 nb_bytes = force_gimple_operand_gsi (gsi, nb_bytes, true, NULL_TREE,
    false, GSI_CONTINUE_LINKING);
 mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
*** generate_memcpy_builtin (struct loop *lo
*** 837,843 
/* The new statements will be placed before LOOP.  */
gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
  
!   nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter);
 nb_bytes = force_gimple_operand_gsi (gsi, nb_bytes, true, NULL_TREE,
    false, GSI_CONTINUE_LINKING);
 dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
--- 842,849 
/* The new statements will be placed before LOOP.  */
gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
  
!   nb_bytes = build_size_arg_loc (loc, partition->main_dr, partition->niter,
!partition->plus_one);
 nb_bytes = force_gimple_operand_gsi (gsi, nb_bytes, true, NULL_TREE,
    false, GSI_CONTINUE_LINKING);
 dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
*** classify_partition (loop_p loop, struct
*** 980,990 
--- 986,998 
tree nb_iter;
data_reference_p single_load, single_store;
bool volatiles_p = false;
+   bool plus_one = false;
  
partition->kind = PKIND_NORMAL;
partition->main_dr = NULL;
partition->secondary_dr = NULL;
partition->niter = NULL_TREE;
+   partition->plus_one = false;
  
EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
  {
*** classify_partition (loop_p loop, struct
*** 1047,1059 
if (!single_store)
  return;
  
!   if (!dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
!  gimple_bb (DR_STMT (single_store))))
! nb_iter = number_of_latch_executions (loop);
!   else
! nb_iter = number_of_exit_cond_executions (loop);
if (!nb_iter || nb_iter == chrec_dont_know)
  return;
  
if 

RE: [PATCH GCC]Improve IVOPT to handle outside and inside loop iv uses differently in GCC

2013-11-21 Thread bin.cheng
Ping and CC Zdenek with the right email.

Thanks,
bin

 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of bin.cheng
 Sent: Wednesday, November 06, 2013 5:51 PM
 To: gcc-patches@gcc.gnu.org
 Cc: Richard Biener; o...@ucw.cz
 Subject: [PATCH GCC]Improve IVOPT to handle outside and inside loop iv
 uses differently in GCC
 
 Hi,
 GCC IVOPT has a problem that it doesn't differentiate between iv uses
 outside of loop from inside ones.  It computes cost for outside iv use
just like
 inside ones, which is wrong because outside iv use should be computed
 along loop exit edge and the cost should be amortized against loop
iteration
 number.  Lastly, the computation of outside iv use is inserted in loop,
rather
 along loop exit edge.
 
 This is interesting since usually outside iv use should be handled
differently,
 or it hurts optimization in several ways like:
 1) Wrong iv candidate is chosen because of inaccurate cost.
 2) Extra computation in loop itself is redundant.
 3) Extra code computing outside iv use in loop may increases register
 pressure because both iv variables before and after stepping could be
alive
 at same time.
 4) IVOPT generates code that it expects to stay as is, passes like DOM
tends
 to break this because of the extra computation.  This hurts targets with
auto-
 increment support more.
 
 This patch fixes the problem.  Bootstrap and test on x86/x86_64/arm.
 Richard, Zdenek,  does this look reasonable?
 
 Thanks,
 bin
 
 
 gcc/testsuite/ChangeLog
 2013-11-06  Bin Cheng  bin.ch...@arm.com
 
   * gcc.dg/tree-ssa/ivopts-outside-loop-use-1.c: New test.
 
 2013-11-06  Bin Cheng  bin.ch...@arm.com
 
   * tree-ssa-loop-ivopts.c (iv_use_p, iv_cand_p): Move around.
   (iv_use_location): New.
   (struct iv): Remove have_use_for and use_id.  New fields
   inside_use, outside_uses_vec and use_loc.
   (struct iv_use): New fields exit_edge and outside_use_p.
   (struct edge_info, edge_info_p): New.
   (struct ivopts_data): New fields alloc_uses_vecs, changed_bbs,
   edge_map and edge_obstack.
   (init_edge_info, get_edge_info): New.
   (dump_use): Dump outside/inside information for iv use.
   (tree_ssa_iv_optimize_init): Init new fields.
   (alloc_iv): Init new fields.  Remove have_use_for and use_id.
   (record_use): New parameter.  Record information for outside loop
   iv use.
   (find_interesting_uses_op): New parameter.  Handle inside and
   outside loop iv uses.
   (find_interesting_uses_cond, idx_record_use): Pass new argument.
   (find_interesting_uses_address): Likewise.
   (find_interesting_uses_stmt, create_new_iv): likewise.
   (find_interesting_uses_outside): Rename exit to exit_edge.
   New parameter normal_edge_p.  Pass new argument.
   (find_interesting_uses): Find iv uses in two passes.
   (get_computation): Compute cost at right position for iv use.
   (determine_use_iv_cost_generic): Ajust cost for outside loop iv use.
   (rewrite_use_outside_of_loop): New.
   (rewrite_use): Call rewrite_use_outside_of_loop.
   (remove_unused_ivs): Keep computation only for inner iv use.
   (free_loop_data):  Reset outside_uses_vec in various iv structures.
   Free alloc_uses_vecs and edge_map.
   (tree_ssa_iv_optimize_finalize): Free and reset.
   (tree_ssa_iv_optimize_loop): Create edge_map.
   (tree_ssa_iv_optimize): Call rewrite_into_loop_closed_ssa if
   necessary.





Fix couple of ACATS failures

2013-11-21 Thread Eric Botcazou
This fixes the couple of ACATS failures

=== acats tests ===
FAIL:   c45531j
FAIL:   c45531l

=== acats Summary ===
# of expected passes2318
# of unexpected failures2

introduced by Jeff's latest threading patches.  The Tree-SSA tail merging pass 
was wrongly spotting basic blocks as duplicate but one of them had a statement 
that could throw and the other hadn't.

Tested on x86_64-suse-linux, applied on the mainline as obvious.


2013-11-21  Eric Botcazou  ebotca...@adacore.com

* tree-ssa-tail-merge.c (stmt_local_def): Return false if the statement
could throw.


-- 
Eric BotcazouIndex: tree-ssa-tail-merge.c
===
--- tree-ssa-tail-merge.c	(revision 205090)
+++ tree-ssa-tail-merge.c	(working copy)
@@ -309,6 +309,7 @@ stmt_local_def (gimple stmt)
   def_operand_p def_p;
 
   if (gimple_has_side_effects (stmt)
+  || stmt_could_throw_p (stmt)
   || gimple_vdef (stmt) != NULL_TREE)
 return false;
 

[PATCH] Adjust nb_iterations_upper_bound in loop header copying

2013-11-21 Thread Richard Biener

This patch decrements nb_iterations_upper_bound by one after we copied
the loop header.  This allows niter + 1 to more often not overflow.

Bootstrapped and tested on x86_64-unknown-linux-gnu, installed to trunk.

Richard.

2013-11-21  Richard Biener  rguent...@suse.de

* tree-ssa-loop-ch.c (copy_loop_headers): Decrement
nb_iterations_upper_bound by one.

Index: gcc/tree-ssa-loop-ch.c
===
--- gcc/tree-ssa-loop-ch.c  (revision 205097)
+++ gcc/tree-ssa-loop-ch.c  (working copy)
@@ -243,6 +243,16 @@ copy_loop_headers (void)
 are not now, since there was the loop exit condition.  */
   split_edge (loop_preheader_edge (loop));
   split_edge (loop_latch_edge (loop));
+
+  /* We peeled off one iteration of the loop thus we can lower
+the maximum number of iterations if we have a previously
+recorded value for that.  */
+  double_int max;
+  if (get_max_loop_iterations (loop, max))
+   {
+ max -= double_int_one;
+ loop->nb_iterations_upper_bound = max;
+   }
 }
 
   update_ssa (TODO_update_ssa);


[ping] Improve debug info for small structures (2)

2013-11-21 Thread Eric Botcazou
The submission is at http://gcc.gnu.org/ml/gcc-patches/2013-10/msg02007.html

Thanks in advance.

-- 
Eric Botcazou


[PATCH] Move niter computes around in preparation for 59058 fix

2013-11-21 Thread Richard Biener

This applies some TLC to the vectorizers various niter and related
computes.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2013-11-21  Richard Biener  rguent...@suse.de

* tree-vect-loop-manip.c (vect_build_loop_niters,
vect_generate_tmps_on_preheader): Move ...
* tree-vect-loop.c (vect_build_loop_niters,
vect_generate_tmps_on_preheader): ... here and simplify.
(vect_transform_loop): Call them here and pass down results
to consumers.
* tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound):
Get niter variables from caller.
(vect_do_peeling_for_alignment): Likewise.
* tree-vectorizer.h (vect_generate_tmps_on_preheader): Remove.
(vect_do_peeling_for_loop_bound, vect_do_peeling_for_alignment):
Adjust prototypes.

Index: gcc/tree-vect-loop-manip.c
===
*** gcc/tree-vect-loop-manip.c  (revision 205118)
--- gcc/tree-vect-loop-manip.c  (working copy)
*** find_loop_location (struct loop *loop)
*** 1400,1550 
  }
  
  
- /* This function builds ni_name = number of iterations loop executes
-on the loop preheader.  If SEQ is given the stmt is instead emitted
-there.  */
- 
- static tree
- vect_build_loop_niters (loop_vec_info loop_vinfo, gimple_seq seq)
- {
-   tree ni_name, var;
-   gimple_seq stmts = NULL;
-   edge pe;
-   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
- 
-   var = create_tmp_var (TREE_TYPE (ni), "niters");
-   ni_name = force_gimple_operand (ni, &stmts, false, var);
- 
-   pe = loop_preheader_edge (loop);
-   if (stmts)
- {
-   if (seq)
-   gimple_seq_add_seq (seq, stmts);
-   else
-   {
- basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
- gcc_assert (!new_bb);
-   }
- }
- 
-   return ni_name;
- }
- 
- 
- /* This function generates the following statements:
- 
-  ni_name = number of iterations loop executes
-  ratio = ni_name / vf
-  ratio_mult_vf_name = ratio * vf
- 
-  and places them at the loop preheader edge or in COND_EXPR_STMT_LIST
-  if that is non-NULL.  */
- 
- void
- vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
-tree *ni_name_ptr,
-tree *ratio_mult_vf_name_ptr,
-tree *ratio_name_ptr,
-gimple_seq cond_expr_stmt_list)
- {
- 
-   edge pe;
-   basic_block new_bb;
-   gimple_seq stmts;
-   tree ni_name, ni_minus_gap_name;
-   tree var;
-   tree ratio_name;
-   tree ratio_mult_vf_name;
-   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-   tree ni = LOOP_VINFO_NITERS (loop_vinfo);
-   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-   tree log_vf;
- 
-   pe = loop_preheader_edge (loop);
- 
-   /* Generate temporary variable that contains
-  number of iterations loop executes.  */
- 
-   ni_name = vect_build_loop_niters (loop_vinfo, cond_expr_stmt_list);
-   log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
- 
-   /* If epilogue loop is required because of data accesses with gaps, we
-  subtract one iteration from the total number of iterations here for
-  correct calculation of RATIO.  */
-   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
- {
-   ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
-  ni_name,
-  build_one_cst (TREE_TYPE (ni_name)));
-   if (!is_gimple_val (ni_minus_gap_name))
-   {
- var = create_tmp_var (TREE_TYPE (ni), ni_gap);
- 
-   stmts = NULL;
-   ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, stmts,
-   true, var);
-   if (cond_expr_stmt_list)
- gimple_seq_add_seq (cond_expr_stmt_list, stmts);
-   else
- {
-   pe = loop_preheader_edge (loop);
-   new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-   gcc_assert (!new_bb);
- }
- }
- }
-   else
- ni_minus_gap_name = ni_name;
- 
-   /* Create: ratio = ni >> log2(vf) */
- 
-   ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_minus_gap_name),
-   ni_minus_gap_name, log_vf);
-   if (!is_gimple_val (ratio_name))
- {
-   var = create_tmp_var (TREE_TYPE (ni), bnd);
- 
-   stmts = NULL;
-   ratio_name = force_gimple_operand (ratio_name, stmts, true, var);
-   if (cond_expr_stmt_list)
-   gimple_seq_add_seq (cond_expr_stmt_list, stmts);
-   else
-   {
- pe = loop_preheader_edge (loop);
- new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
- gcc_assert (!new_bb);
-   }
- }
- 
-   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
- 
-   ratio_mult_vf_name = 

Re: [RFC] Old school parallelization of WPA streaming

2013-11-21 Thread Richard Biener
On Thu, 21 Nov 2013, Jan Hubicka wrote:

 Hi,
 I am not sure where we converged concerning the fork trick.  I am using it in 
 my
 tree for months and it does save my waiting time for WPA compilations, so I am
 re-attaching the patch.
 
 Does it seem resonable for mainline?
 
 As for other plans mentioned on this thread
   
   I still have some items on list here
1) avoid function sections to be decompressed by WPA
   (this won't cause much compile time improvements as decompression is
well below 10% of runtime)
  
  still low-hanging
  
  finally get a LTO section header!  (with a flag telling whether the
  section is compressed)
 
 I have patch for it somewhere (not particularly clean, we need to dig more 
 into
 the basic section handling code in LTO). The benefits however was quite small
 (we get dominated by decls and types still), so perhaps this can wait for next
 stage1 or a development branch.
  
2) put variable initializers into named sections just as function bodies
   are.
   Seeing Martin's systemtaps of firefox/gimp/inkscape, to my surprise 
   the
   initializers are actually about as big as the text segment.  While
   it seems bit wasteful to pust single integer_cst there (and we can
   special case this), it seems that there is a promise for vtables
   and other stuff.
   
   To make devirt work, we will need to load vtables into memory (or
   invent representation to stream them other way that would be similarly
   big). Still we will avoid need to load them in 5000 copies and merge
   them.
 
 Did not fnish this, unfortunately (devirtualization was more involved and
 I lost track on this one).  I had a prototype working where savings was about
 15% of WPA memory.  I will try to get cleaner implementation soon.
 
3) I think good part of function/partitioning overhead is because 
   abstract
   origin streaming is utterly broken.
 
 Yep, this is definitely still in longer term plans only.

Why do you need an additional -fparallelism?  Wouldn't
-fwpa=... be a better match, matching -flto=...?  As we already
pass down a -fwpa option to WPA this would make things easier, no?

Thanks,
Richard.

 Honza
 
   * lto-cgraph.c (asm_nodes_output): Make global.
   * lto-streamer.h (asm_nodes_output): Declare.
   * lto-wrapper.c (parallel, jobserver): Make global.
   (run_gcc): Pass down -fparallelism
 
   * lto.c (lto_parallelism): New variable.
   (do_stream_out): New function.
   (stream_out): New function.
   (lto_wpa_write_files): Use it.
   * lang.opt (fparallelism): New.
   * lto.h (lto_parallelism): Declare.
   * lto-lang.c (lto_handle_option): Add fparalelism.
 
 Index: lto-cgraph.c
 ===
 --- lto-cgraph.c  (revision 201891)
 +++ lto-cgraph.c  (working copy)
 @@ -50,6 +50,9 @@ along with GCC; see the file COPYING3.
  #include context.h
  #include pass_manager.h
  
 +/* True when asm nodes has been output.  */
 +bool asm_nodes_output = false;
 +
  static void output_cgraph_opt_summary (void);
 static void input_cgraph_opt_summary (vec<symtab_node> nodes);
  
 @@ -852,7 +855,6 @@ output_symtab (void)
lto_symtab_encoder_iterator lsei;
int i, n_nodes;
lto_symtab_encoder_t encoder;
 -  static bool asm_nodes_output = false;
  
if (flag_wpa)
  output_cgraph_opt_summary ();
 Index: lto-streamer.h
 ===
 --- lto-streamer.h(revision 201891)
 +++ lto-streamer.h(working copy)
 @@ -870,6 +870,7 @@ void lto_output_location (struct output_
  
  
  /* In lto-cgraph.c  */
 +extern bool asm_nodes_output;
  lto_symtab_encoder_t lto_symtab_encoder_new (bool);
  int lto_symtab_encoder_encode (lto_symtab_encoder_t, symtab_node);
  void lto_symtab_encoder_delete (lto_symtab_encoder_t);
 Index: lto-wrapper.c
 ===
 --- lto-wrapper.c (revision 201891)
 +++ lto-wrapper.c (working copy)
 @@ -56,6 +56,9 @@ along with GCC; see the file COPYING3.
  
  int debug;   /* true if -save-temps.  */
  int verbose; /* true if -v.  */
 +int parallel = 0;/* number of parallel builds specified
 +by -flto=N  */
 +int jobserver = 0;   /* true if -flto=jobserver was used.  */
  
  enum lto_mode_d {
LTO_MODE_NONE, /* Not doing LTO.  */
 @@ -445,8 +448,6 @@ run_gcc (unsigned argc, char *argv[])
char *list_option_full = NULL;
const char *linker_output = NULL;
const char *collect_gcc, *collect_gcc_options;
 -  int parallel = 0;
 -  int jobserver = 0;
bool no_partition = false;
struct cl_decoded_option *fdecoded_options = NULL;
unsigned int fdecoded_options_count = 0;
 @@ -630,6 +631,16 @@ run_gcc (unsigned argc, char *argv[])
   

Re: [patch,libgfortran] Fix binary128 ERFC_SCALED

2013-11-21 Thread Andreas Schwab
FX fxcoud...@gmail.com writes:

 2013-11-20  Francois-Xavier Coudert  fxcoud...@gcc.gnu.org

   PR libfortran/59227

There is no connection to PR59227

Andreas.

-- 
Andreas Schwab, SUSE Labs, sch...@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
And now for something completely different.


Re: [PATCH, MPX, 2/X] Pointers Checker [14/25] Function splitting

2013-11-21 Thread Ilya Enkovich
2013/11/20 Richard Biener richard.guent...@gmail.com:
 On Wed, Nov 20, 2013 at 10:57 AM, Richard Biener
 richard.guent...@gmail.com wrote:
 On Tue, Nov 19, 2013 at 9:18 PM, Ilya Enkovich enkovich@gmail.com 
 wrote:
 2013/11/19 Jeff Law l...@redhat.com:
 On 11/19/13 05:20, Ilya Enkovich wrote:

 2013/11/19 Richard Biener richard.guent...@gmail.com:

 On Mon, Nov 18, 2013 at 8:12 PM, Ilya Enkovich enkovich@gmail.com
 wrote:

 2013/11/18 Jeff Law l...@redhat.com:

 On 11/18/13 11:27, Ilya Enkovich wrote:



 How does pointer passed to regular function differ from pointer passed
 to splitted function? How do I know then which pointer is to be passed
 with bounds and wchich one is not? Moreover current ABI does not allow
 to pass bounds with no pointer or pass bounds for some pointers in the
 call only.


 But I don't see any case in function splitting where we're going to
 want to
 pass the pointer without the bounds.  If you want the former, you're
 going
 to want the latter.


 There are at least cases when checks are eliminated or when lots of
 pointer usages are accompanied with few checks performed earlier (e.g.
 we are working with array). In such cases splitted part may easily get
 no bounds.


 I really don't see why you need to do anything special here.  At the
 most an
 assert in the splitting code to ensure that you don't have a situation
 where
 there's mixed pointers with bounds and pointers without bounds should
 be all
 you need or that you passed a bounds with no associated pointer :-)


 It would also require generation of proper bind_bounds calls in the
 original function and arg_bounds calls in a separated part. So,
 special support is required.


 Well, only to keep proper instrumentation.  I hope code still works
 (doesn't trap) when optimizations wreck the bounds?  Thus all
 these patches are improving bounds propagation but are not required
 for correctness?  If so please postpone all of them until after the
 initial support is merged.  If not, please make sure BND instrumentation
 works conservatively when optimizations wreck it.


 All patches I sent for optimization passes are required to avoid ICEs
 when compiling instrumented code.

 Then I think we're going to need to understand them in more detail. That's
 going to mean testcases, probably dumps and some commentary about what went
 wrong.

 I can't speak for Richi, but when optimizations get disabled, I tend to 
 want
 to really understand why and make sure we're not papering over a larger
 problem.

 The tail recursion elimination one we're discussing now is a great example.
 At this point I understand the problem you're running into, but I'm still
 trying to wrap my head around the implications of the funny semantics of
 __builtin_arg_bounds and how they may cause other problems.

 Root of all problems if implicit data flow hidden in arg_bounds and
 bind_bounds.  Calls consume bounds and compiler does not know it. And
 input bounds are always expressed via arg_bounds calls and never
 expressed via formal args. Obviously optimizers have to be taught
 about these data dependencies to work correctly.

 I agree semantics of arg_bounds call creates many issues for
 optimizers but currently I do not see a better replacement for it.

 But it looks incredibly fragile if you ICE once something you don't like
 happens.  You should be able to easily detect the case and punt,
 that is, drop to non-instrumented aka invalidating bounds.

 Thus, I really really don't like these patches.  They hint at some
 deeper problem with the overall design (or the HW feature or the
 accompaning ABI).

 Note that this, the intrusiveness of the feature and the questionable
 gain makes me question whether GCC should have support for this
 feature (and whether we really should rush this in this late).

 Thus, I hereby formally ask to push back this feature to 4.10.

I think you overestimate the intrusiveness of the checker. Necessity
of changes in optimization passes is artificial and is used to get
maximum checking quality. It can be easily made immune for different
code transformation by simple changes in the process of
instrumentation expand (I have a fix for that already). With that
change only pass itself, support for bound args during expand, support
in i386 target and minor infrastructure changes are required (e.g.
flag in varpool_node, bounds_constants). Changes in inline,
propagation, SRA, tail recursion, strlen, function splitting, string
function builtins expand would become optional and affect checking
quality only.

Also note that all changes do not affect compilation process when no
instrumentation is used.

Please reconsider your decision about pushing it to 4.10 taking that
into account.

Thanks,
Ilya


 Thanks,
 Richard.

 Richard.

 Ilya



 jeff



RE: [PING][PATCH] LRA: check_rtl modifies RTL instruction stream

2013-11-21 Thread Robert Suchanek
Thanks.  Vlad may not be available right now, and even if he is, he's 
probably typing one-handed.

So I took care of installing this for you.

Thanks,
Jeff

Thanks!

Regards,
Robert




Re: [Patch, ARM] New feature to minimize the literal load for armv7-m target

2013-11-21 Thread Richard Earnshaw
On 21/11/13 02:53, Terry Guo wrote:
 BR,
 Terry
 
 2013-11-21  Terry Guo  terry@arm.com
 
  * doc/invoke.texi (-mslow-flash-data): Document new option.
  * config/arm/arm.opt (mslow-flash-data): New option.
  * config/arm/arm-protos.h
 (arm_max_const_double_inline_cost): Declare it.
  * config/arm/arm.h (TARGET_USE_MOVT): Always true when
  literal pools are disabled.
  (arm_disable_literal_pool): Declare it.
  * config/arm/arm.c (arm_disable_literal_pool): New
 variable.
  (arm_option_override): Handle new option.
  (thumb2_legitimate_address_p): Invalidate memory operand
  (mem (symbol_ref )) to avoid the use of literal pools
 when literal
  pools are disabled.
Don't allow symbol references when literal pools are disabled.

  (arm_max_const_double_inline_cost): New function.
  * config/arm/arm.md (types.md): Include it before ...
  (use_literal_pool): New attribute.
  (enabled): Use new attribute.
  (split pattern): Replace symbol+offset with MOVW/MOVT.
 
 

 diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
 index 0d68f01..f453309 100644
 --- a/gcc/config/arm/arm.c
 +++ b/gcc/config/arm/arm.c
 @@ -869,6 +869,9 @@ int arm_arch_thumb_hwdiv;
 than core registers.  */
  int prefer_neon_for_64bits = 0;
  
 +/* Nonzero if we shouldn't use literal pools.  */
 +bool arm_disable_literal_pool = 0;
 +

Use false, not 0.

 @@ -2573,6 +2576,16 @@ arm_option_override (void)
if (TARGET_APCS_FRAME)
  flag_shrink_wrap = false;
  
 +  /* We only support -mslow-flash-data on armv7-m targets.  */
 +  if (target_slow_flash_data
 +   ((!(arm_arch7  !arm_arch_notm)  !arm_arch7em)
 +   || (TARGET_THUMB1 || flag_pic || TARGET_NEON)))
 +error (-mslow-flash-data only supports non-pic code on armv7-m 
 targets);
 +
 +  /* Currently, for slow flash data, we just disable literal pools.  */
 +  if (target_slow_flash_data)
 +arm_disable_literal_pool = 1;
 +

Use true.

 diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
 index 6fc56b9..071e0c5 100644
 --- a/gcc/doc/invoke.texi
 +++ b/gcc/doc/invoke.texi
 @@ -12326,6 +12327,13 @@ Enables using Neon to handle scalar 64-bits 
 operations. This is
  disabled by default since the cost of moving data from core registers
  to Neon is high.
  
 +@item -mslow-flash-data
 +@opindex mslow-flash-data
 +The v7 m-profile only option.

This option is only supported when compiling for ARMv7 M-profile.

OK with those changes.

R.



Re: [RFC] Old school parallelization of WPA streaming

2013-11-21 Thread Jan Hubicka
 
 Why do you need an additional -fparallelism?  Wouldn't
 -fwpa=... be a better match, matching -flto=...?  As we already
 pass down a -fwpa option to WPA this would make things easier, no?

My plan was to possibly use same option later for parallelizing more parts of
compiler, not only WPA streaming. Streaming in may have some chance if we get
into thread safety of GGC or move sufficient amount of stuff out of GGC.  Also
we can parallelize inliner heuristic or IPA-PTA if it will ever work. So it
would make sense with -flto-partition=none and perhaps with local optimization,
too.

But I can definitely update the patch to use -fwpa=N and we can deal with this
once this becomes real. (i.e. I have no clue how to parallelize the inliner without
making its decisions dependent on the parallelism and declining with parallelism
increased, nor do I have real plans for the stream-in procedure)

Honza
 
 Thanks,
 Richard.
 
  Honza
  
  * lto-cgraph.c (asm_nodes_output): Make global.
  * lto-streamer.h (asm_nodes_output): Declare.
  * lto-wrapper.c (parallel, jobserver): Make global.
  (run_gcc): Pass down -fparallelism
  
  * lto.c (lto_parallelism): New variable.
  (do_stream_out): New function.
  (stream_out): New function.
  (lto_wpa_write_files): Use it.
  * lang.opt (fparallelism): New.
  * lto.h (lto_parallelism): Declare.
  * lto-lang.c (lto_handle_option): Add fparalelism.
  
  Index: lto-cgraph.c
  ===
  --- lto-cgraph.c(revision 201891)
  +++ lto-cgraph.c(working copy)
  @@ -50,6 +50,9 @@ along with GCC; see the file COPYING3.
   #include context.h
   #include pass_manager.h
   
  +/* True when asm nodes has been output.  */
  +bool asm_nodes_output = false;
  +
   static void output_cgraph_opt_summary (void);
   static void input_cgraph_opt_summary (vecsymtab_node  nodes);
   
  @@ -852,7 +855,6 @@ output_symtab (void)
 lto_symtab_encoder_iterator lsei;
 int i, n_nodes;
 lto_symtab_encoder_t encoder;
  -  static bool asm_nodes_output = false;
   
 if (flag_wpa)
   output_cgraph_opt_summary ();
  Index: lto-streamer.h
  ===
  --- lto-streamer.h  (revision 201891)
  +++ lto-streamer.h  (working copy)
  @@ -870,6 +870,7 @@ void lto_output_location (struct output_
   
   
   /* In lto-cgraph.c  */
  +extern bool asm_nodes_output;
   lto_symtab_encoder_t lto_symtab_encoder_new (bool);
   int lto_symtab_encoder_encode (lto_symtab_encoder_t, symtab_node);
   void lto_symtab_encoder_delete (lto_symtab_encoder_t);
  Index: lto-wrapper.c
  ===
  --- lto-wrapper.c   (revision 201891)
  +++ lto-wrapper.c   (working copy)
  @@ -56,6 +56,9 @@ along with GCC; see the file COPYING3.
   
   int debug; /* true if -save-temps.  */
   int verbose;   /* true if -v.  */
  +int parallel = 0;  /* number of parallel builds specified
  +  by -flto=N  */
  +int jobserver = 0; /* true if -flto=jobserver was used.  */
   
   enum lto_mode_d {
 LTO_MODE_NONE,   /* Not doing LTO.  */
  @@ -445,8 +448,6 @@ run_gcc (unsigned argc, char *argv[])
 char *list_option_full = NULL;
 const char *linker_output = NULL;
 const char *collect_gcc, *collect_gcc_options;
  -  int parallel = 0;
  -  int jobserver = 0;
 bool no_partition = false;
 struct cl_decoded_option *fdecoded_options = NULL;
 unsigned int fdecoded_options_count = 0;
  @@ -630,6 +631,16 @@ run_gcc (unsigned argc, char *argv[])
if (parallel = 1)
  parallel = 0;
  }
  + if (jobserver)
  +   {
  + obstack_ptr_grow (argv_obstack, xstrdup 
  (-fparallelism=jobserver));
  +   }
  + else if (parallel  1)
  +   {
  + char buf[256];
  + sprintf (buf, -fparallelism=%i, parallel);
  + obstack_ptr_grow (argv_obstack, xstrdup (buf));
  +   }
/* Fallthru.  */
   
  case OPT_flto:
  Index: lto/lto.c
  ===
  --- lto/lto.c   (revision 201891)
  +++ lto/lto.c   (working copy)
  @@ -49,6 +49,9 @@ along with GCC; see the file COPYING3.
   #include context.h
   #include pass_manager.h
   
  +/* Number of parallel tasks to run, -1 if we want to use GNU Make 
  jobserver.  */
  +int lto_parallelism;
  +
   static GTY(()) tree first_personality_decl;
   
   /* Returns a hash code for P.  */
  @@ -3002,6 +3005,98 @@ cmp_partitions_order (const void *a, con
 return orderb - ordera;
   }
   
  +/* Actually stream out ENCODER into TEMP_FILENAME.  */
  +
  +void
  +do_stream_out (char *temp_filename, lto_symtab_encoder_t encoder)
  +{
  +  lto_file *file = lto_obj_file_open (temp_filename, true);
  +  if (!file)
  +

Re: [SH] PR 30807 - Add test case

2013-11-21 Thread Hans-Peter Nilsson
On Tue, 5 Nov 2013, Mike Stump wrote:
 On Nov 5, 2013, at 1:45 PM, Oleg Endo oleg.e...@t-online.de wrote:
  You're right,  it's redundant.  It should be just
  /* { dg-do compile } */
 
  shouldn't it?

 Yup, that's my take.

Or nothing at all, as compile seems to be the default here.
(grep for dg-do-what-default)

brgds, H-P


Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread N.M. Maclaren

On Nov 21 2013, FX wrote:


Note: Gradual underflow control is implemented as not supported by the 
processor (its SUPPORT function returns false, and the GET and SET 
procedures abort if you call them), because we probably don't have 
targets where it would work (and I don't think people use it much, if at 
all). That's explicitly allowed by the standard.


That's a reasonable decision, but it is actually used at least ten times
as much as everything else put together, possibly a hundred times as much.
However, it is used in the form of selecting hard underflow using a
compilation option, and not within the program.  You certainly DO have
targets where it would work, even dynamically within the program, and I
think that it could be done even on x86.  That isn't the same as it
should be done, of course!

I should have to study the various architectures to remind myself of how
to do it, but there were some which had a simple flag and I am pretty
sure that they are among gfortran's current targets.  How many of the
CPUs support that flag is another matter.  On at least the x86, it
would have to be done by writing a floating-point interrupt handler,
which I think can be done without privilege and could be made to work.
It wouldn't be a lot of code, but would need some very low-level hacking.

It is getting rarer, as the support of denormalised numbers gets less
direly inefficient and they cause less chaos in the libraries, but
there are still codes that need hard underflow and quite a few algorithms
that work better with it.


Regards,
Nick Maclaren.



Re: [RFC] Old school parallelization of WPA streaming

2013-11-21 Thread Richard Biener
On Thu, 21 Nov 2013, Jan Hubicka wrote:

  
  Why do you need an additional -fparallelism?  Wouldn't
  -fwpa=... be a better match, matching -flto=...?  As we already
  pass down a -fwpa option to WPA this would make things easier, no?
 
 My plan was to possibly use same option later for parallelizing more parts of
 compiler, not only WPA streaming. Streaming in may have some chance if we get
 into thread safety of GGC or move sufficient amount of stuff out of GGC.  Also
 we can parallelize inliner heuristic or IPA-PTA if it will ever work. So it
 would make sense with -flto-partition=none and perhaps with local 
 optimization,
 too.

I'd like to drop -flto-partition=none eventually.  It's just one more
path through the compiler to support ...

 But I can definitely update the patch to use -fwpa=N and we can deal with this
 once this becomes real. (i.e. I have no clue how to parallelize inliner 
 without
 making its decisions dependent on the parallelizm and declining with 
 parallelizm
 increased nor I have real plans for stream in procedure)

Please.

Richard.

 Honza
  
  Thanks,
  Richard.
  
   Honza
   
 * lto-cgraph.c (asm_nodes_output): Make global.
 * lto-streamer.h (asm_nodes_output): Declare.
 * lto-wrapper.c (parallel, jobserver): Make global.
 (run_gcc): Pass down -fparallelism
   
 * lto.c (lto_parallelism): New variable.
 (do_stream_out): New function.
 (stream_out): New function.
 (lto_wpa_write_files): Use it.
 * lang.opt (fparallelism): New.
 * lto.h (lto_parallelism): Declare.
 * lto-lang.c (lto_handle_option): Add fparalelism.
   
   Index: lto-cgraph.c
   ===
   --- lto-cgraph.c  (revision 201891)
   +++ lto-cgraph.c  (working copy)
   @@ -50,6 +50,9 @@ along with GCC; see the file COPYING3.
#include context.h
#include pass_manager.h

   +/* True when asm nodes has been output.  */
   +bool asm_nodes_output = false;
   +
static void output_cgraph_opt_summary (void);
static void input_cgraph_opt_summary (vecsymtab_node  nodes);

   @@ -852,7 +855,6 @@ output_symtab (void)
  lto_symtab_encoder_iterator lsei;
  int i, n_nodes;
  lto_symtab_encoder_t encoder;
   -  static bool asm_nodes_output = false;

  if (flag_wpa)
output_cgraph_opt_summary ();
   Index: lto-streamer.h
   ===
   --- lto-streamer.h(revision 201891)
   +++ lto-streamer.h(working copy)
   @@ -870,6 +870,7 @@ void lto_output_location (struct output_


/* In lto-cgraph.c  */
   +extern bool asm_nodes_output;
lto_symtab_encoder_t lto_symtab_encoder_new (bool);
int lto_symtab_encoder_encode (lto_symtab_encoder_t, symtab_node);
void lto_symtab_encoder_delete (lto_symtab_encoder_t);
   Index: lto-wrapper.c
   ===
   --- lto-wrapper.c (revision 201891)
   +++ lto-wrapper.c (working copy)
   @@ -56,6 +56,9 @@ along with GCC; see the file COPYING3.

int debug;   /* true if -save-temps.  */
int verbose; /* true if -v.  */
   +int parallel = 0;/* number of parallel builds 
   specified
   +by -flto=N  */
   +int jobserver = 0;   /* true if -flto=jobserver was 
   used.  */

enum lto_mode_d {
  LTO_MODE_NONE, /* Not doing LTO.  */
   @@ -445,8 +448,6 @@ run_gcc (unsigned argc, char *argv[])
  char *list_option_full = NULL;
  const char *linker_output = NULL;
  const char *collect_gcc, *collect_gcc_options;
   -  int parallel = 0;
   -  int jobserver = 0;
  bool no_partition = false;
  struct cl_decoded_option *fdecoded_options = NULL;
  unsigned int fdecoded_options_count = 0;
   @@ -630,6 +631,16 @@ run_gcc (unsigned argc, char *argv[])
   if (parallel = 1)
 parallel = 0;
 }
   +   if (jobserver)
   + {
   +   obstack_ptr_grow (argv_obstack, xstrdup 
   (-fparallelism=jobserver));
   + }
   +   else if (parallel  1)
   + {
   +   char buf[256];
   +   sprintf (buf, -fparallelism=%i, parallel);
   +   obstack_ptr_grow (argv_obstack, xstrdup (buf));
   + }
   /* Fallthru.  */

 case OPT_flto:
   Index: lto/lto.c
   ===
   --- lto/lto.c (revision 201891)
   +++ lto/lto.c (working copy)
   @@ -49,6 +49,9 @@ along with GCC; see the file COPYING3.
#include context.h
#include pass_manager.h

   +/* Number of parallel tasks to run, -1 if we want to use GNU Make 
   jobserver.  */
   +int lto_parallelism;
   +
static GTY(()) tree first_personality_decl;

/* Returns a hash code for P.  */
   @@ -3002,6 +3005,98 @@ cmp_partitions_order (const void *a, con
 

[PATCH] Improve { x, x + 3, x + 6, x + 9 } expansion (take 2)

2013-11-21 Thread Jakub Jelinek
On Thu, Nov 21, 2013 at 07:43:35AM +1000, Richard Henderson wrote:
 On 11/20/2013 07:44 PM, Jakub Jelinek wrote:
  On Wed, Nov 20, 2013 at 10:31:38AM +0100, Richard Biener wrote:
  Aww ;)  Nice improvement.  Generally when I see this I always wonder
  whether we want to do this kind of stuff pre RTL expansion.
  1st to not rely on being able to TER, 2nd to finally eventually
  get rid of TER.
 
  These patches are unfortunately a step backward for #2.
 
  As of the patch, do we have a way to query whether the target
  can efficiently broadcast?  If so this IMHO belongs in generic
  
  We don't.  Perhaps if we'd add optab for vec_dupmode and mentioned
  clearly in the documentation that it should be used only if it is reasonably
  efficient.  But still, even with optab, it would probably better to do it
  in the veclower* passes than in the vectorizer itself.
 
 I think we can assume that broadcast is relatively efficient, whether or not
 vec_dup is present.  I'd lean to making the transformation generic to start
 with, so that you don't need extra handling in the i386 backend.

Ok, here is a generic veclower implementation without looking at any optabs,
so far only handles PLUS_EXPR, what operation other than MULT_EXPR would
make sense here?  Though, handling MULT_EXPR also would complicate the code
slightly (it would need to handle say:
  _2 = _1(D) + 1;
  _3 = _2 + 2;
  _4 = _3 * 2;
  _5 = _4 * 3;
  _6 = { _3, _4, _5, _4 };
where we could start thinking first the operation is PLUS_EXPR, but it
actually is MULT_EXPR with _3 as base).  Also, for MULT_EXPR, supposedly
we could handle some values to be constant 0, like in:
  _2 = _1(D) * 5;
  _3 = _2 * 2;
  _4 = _1(D) * 10;
  _5 = { _3, 0, _4, _2, _1(D), 0, _4, _2 };

Bootstrap/regtest pending, ok at least for this for the start and can be
improved later on?

2013-11-21  Jakub Jelinek  ja...@redhat.com

* tree-vect-generic.c (optimize_vector_constructor): New function.
(expand_vector_operations_1): Call it.

* gcc.dg/vect/vect-124.c: New test.

--- gcc/tree-vect-generic.c.jj  2013-11-19 21:56:40.0 +0100
+++ gcc/tree-vect-generic.c 2013-11-21 11:17:55.146118161 +0100
@@ -988,6 +988,84 @@ expand_vector_operation (gimple_stmt_ite
gimple_assign_rhs1 (assign),
gimple_assign_rhs2 (assign), code);
 }
+
+/* Try to optimize
+   a_5 = { b_7, b_7 + 3, b_7 + 6, b_7 + 9 };
+   style stmts into:
+   _9 = { b_7, b_7, b_7, b_7 };
+   a_5 = _9 + { 0, 3, 6, 9 };
+   because vector splat operation is usually more efficient
+   than piecewise initialization of the vector.  */
+
+static void
+optimize_vector_constructor (gimple_stmt_iterator *gsi)
+{
+  gimple stmt = gsi_stmt (*gsi);
+  tree lhs = gimple_assign_lhs (stmt);
+  tree rhs = gimple_assign_rhs1 (stmt);
+  tree type = TREE_TYPE (rhs);
+  unsigned int i, j, nelts = TYPE_VECTOR_SUBPARTS (type);
+  bool all_same = true;
+  constructor_elt *elt;
+  tree *cst;
+  gimple g;
+  tree base = NULL_TREE;
+
+  if (nelts = 2 || CONSTRUCTOR_NELTS (rhs) != nelts)
+return;
+  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (rhs), i, elt)
+if (TREE_CODE (elt-value) != SSA_NAME
+   || TREE_CODE (TREE_TYPE (elt-value)) == VECTOR_TYPE)
+  return;
+else
+  {
+   tree this_base = elt-value;
+   if (this_base != CONSTRUCTOR_ELT (rhs, 0)-value)
+ all_same = false;
+   for (j = 0; j  nelts + 1; j++)
+ {
+   g = SSA_NAME_DEF_STMT (this_base);
+   if (is_gimple_assign (g)
+gimple_assign_rhs_code (g) == PLUS_EXPR
+TREE_CODE (gimple_assign_rhs2 (g)) == INTEGER_CST
+TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME
+!SSA_NAME_OCCURS_IN_ABNORMAL_PHI (gimple_assign_rhs1 (g)))
+ this_base = gimple_assign_rhs1 (g);
+   else
+ break;
+ }
+   if (i == 0)
+ base = this_base;
+   else if (this_base != base)
+ return;
+  }
+  if (all_same)
+return;
+  cst = XALLOCAVEC (tree, nelts);
+  for (i = 0; i  nelts; i++)
+{
+  tree this_base = CONSTRUCTOR_ELT (rhs, i)-value;;
+  cst[i] = build_zero_cst (TREE_TYPE (base));
+  while (this_base != base)
+   {
+ g = SSA_NAME_DEF_STMT (this_base);
+ cst[i] = fold_binary (PLUS_EXPR, TREE_TYPE (base),
+   cst[i], gimple_assign_rhs2 (g));
+ if (cst[i] == NULL_TREE
+ || TREE_CODE (cst[i]) != INTEGER_CST
+ || TREE_OVERFLOW (cst[i]))
+   return;
+ this_base = gimple_assign_rhs1 (g);
+   }
+}
+  for (i = 0; i  nelts; i++)
+CONSTRUCTOR_ELT (rhs, i)-value = base;
+  g = gimple_build_assign (make_ssa_name (type, NULL), rhs);
+  gsi_insert_before (gsi, g, GSI_SAME_STMT);
+  g = gimple_build_assign_with_ops (PLUS_EXPR, lhs, gimple_assign_lhs (g),
+   build_vector (type, cst));
+  

Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread FX
 That's a reasonable decision, but it is actually used at least ten times
 as much as everything else put together, possibly a hundred times as much.

I believe we are in pretty different parts of the community. Around me I rarely 
see it used, while people check for nans, infinities, and exception flags 
often. Also, aborting on certain floating-point exceptions is widely used as a 
debugging aid.

 However, it is used in the form of selecting hard underflow using a
 compilation option, and not within the program.  You certainly DO have
 targets where it would work, even dynamically within the program, and I
 think that it could be done even on x86.  That isn't the same as it
 should be done, of course!

Indeed, 387/SSE has flush-to-zero modes. But other APIs do not (glibc, SysV, 
AIX).
I’m perfectly willing to add it, especially to 387/SSE, if given a bit of help 
(someone to write the assembly code).

Thanks for your feedback,
FX

Fix vectorizer testsuite fallout

2013-11-21 Thread Jan Hubicka
Hi,
this patch fixes a problem with missing dump files with -flto and the vectorizer
and also silences an error in the gcc.dg/20081223-1.c testcase.  We ought to error
on these even w/o -ffat-lto-objects; I will look into it ASAP.
We need to move these errors from varasm/wrapup into symtab finalization,
which is obviously the correct thing to do once we are fully unit-at-a-time,
but it may contain surprises (from my past experiences).

Regtested ppc64-linux

* gcc.dg/20081223-1.c: Add -ffat-lto-objects.
* gcc.dg/vect/vect.exp: Add -ffat-lto-objects.
Index: gcc.dg/20081223-1.c
===
--- gcc.dg/20081223-1.c (revision 205132)
+++ gcc.dg/20081223-1.c (working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options -flto { target lto } }  */
+/* { dg-options -flto -ffat-lto-objects { target lto } }  */
 
 typedef struct foo_ foo_t;
 foo_t bar;  /* { dg-error storage size of 'bar' isn't known }  */
Index: gcc.dg/vect/vect.exp
===
--- gcc.dg/vect/vect.exp(revision 205132)
+++ gcc.dg/vect/vect.exp(working copy)
@@ -76,7 +76,7 @@ lappend VECT_SLP_CFLAGS -fdump-tree-slp
 # Main loop.
 set VECT_ADDITIONAL_FLAGS [list ]
 if { [check_effective_target_lto] } {
-lappend VECT_ADDITIONAL_FLAGS -flto
+lappend VECT_ADDITIONAL_FLAGS -flto -ffat-lto-objects
 }
 foreach flags $VECT_ADDITIONAL_FLAGS {
 dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]]  \


Re: Implement C11 _Atomic

2013-11-21 Thread Hans-Peter Nilsson
On Tue, 5 Nov 2013, Joseph S. Myers wrote:

Thanks for doing this!  However, without examples I have trouble
reading out the bits I need as a target maintainer, and I can't
read out the answers from the patch, so pardon a few questions.

 This patch, relative to trunk and based on work done on the C11-atomic
 branch, adds support for C11 _Atomic.  It is intended to include all
 the required language support.

 It does not include the stdatomic.h header; there's a version on the
 branch, but it needs further review against the standard and test
 coverage adding to the testsuite before I can propose it for mainline.

 Support for atomic types having bigger alignment than the
 corresponding non-atomic types is limited: it includes the code to
 increase the alignment of types whose size is exactly 1, 2, 4, 8 or 16
 to that of the corresponding integer type [*], but not anything for
 target-specific alignment increases.

Target-maintainer perspective here: do I read that correctly,
that by default adding _Atomic raises the alignment of that type
to the natural one, for all targets?

To wit,

 There's code for target-specific
 alignment on the branch (and I intend to merge trunk back to the
 branch once this patch is on trunk, so it's easy to tell what the
 changes still left on the branch are), should any target maintainers
 wish to have such alignment.

...is that part needed for alignment that is only
target-specific and other-than-natural?  For example, 8-byte
aligment where required for atomic 4-byte types?

Or is that part also required for
anything-other-than-ordinary-C-type alignment for the target;
say, natural 4-byte alignment of 4-byte-types for targets where
alignment is otherwise packed; where only 1-byte alignment of
the basic type is ABI-mandated?

brgds, H-P


Re: [PATCH] Improve { x, x + 3, x + 6, x + 9 } expansion (take 2)

2013-11-21 Thread Richard Biener
On Thu, 21 Nov 2013, Jakub Jelinek wrote:

 On Thu, Nov 21, 2013 at 07:43:35AM +1000, Richard Henderson wrote:
  On 11/20/2013 07:44 PM, Jakub Jelinek wrote:
   On Wed, Nov 20, 2013 at 10:31:38AM +0100, Richard Biener wrote:
   Aww ;)  Nice improvement.  Generally when I see this I always wonder
   whether we want to do this kind of stuff pre RTL expansion.
   1st to not rely on being able to TER, 2nd to finally eventually
   get rid of TER.
  
   These patches are unfortunately a step backward for #2.
  
   As of the patch, do we have a way to query whether the target
   can efficiently broadcast?  If so this IMHO belongs in generic
   
   We don't.  Perhaps if we'd add optab for vec_dupmode and mentioned
   clearly in the documentation that it should be used only if it is 
   reasonably
   efficient.  But still, even with optab, it would probably better to do it
   in the veclower* passes than in the vectorizer itself.
  
  I think we can assume that broadcast is relatively efficient, whether or not
  vec_dup is present.  I'd lean to making the transformation generic to start
  with, so that you don't need extra handling in the i386 backend.
 
 Ok, here is a generic veclower implementation without looking at any optabs,
 so far only handles PLUS_EXPR, what operation other than MULT_EXPR would
 make sense here?  Though, handling MULT_EXPR also would complicate the code
 slightly (it would need to handle say:
   _2 = _1(D) + 1;
   _3 = _2 + 2;
   _4 = _3 * 2;
   _5 = _4 * 3;
   _6 = { _3, _4, _5, _4 };
 where we could start thinking first the operation is PLUS_EXPR, but it
 actually is MULT_EXPR with _3 as base).  Also, for MULT_EXPR, supposedly
 we could handle some values to be constant 0, like in:
   _2 = _1(D) * 5;
   _3 = _2 * 2;
   _4 = _1(D) * 10;
   _5 = { _3, 0, _4, _2, _1(D), 0, _4, _2 };
 
 Bootstrap/regtest pending, ok at least for this for the start and can be
 improved later on?

Ok, this should catch most of the vectorizer cases.

Zero could also be handled for PLUS_EXPR, likewise one for MULT_EXPR.
I think for induction it's common to have { base, base + 1, base + 2, ... 
}

more comments below

 2013-11-21  Jakub Jelinek  ja...@redhat.com
 
   * tree-vect-generic.c (optimize_vector_constructor): New function.
   (expand_vector_operations_1): Call it.
 
   * gcc.dg/vect/vect-124.c: New test.
 
 --- gcc/tree-vect-generic.c.jj2013-11-19 21:56:40.0 +0100
 +++ gcc/tree-vect-generic.c   2013-11-21 11:17:55.146118161 +0100
 @@ -988,6 +988,84 @@ expand_vector_operation (gimple_stmt_ite
   gimple_assign_rhs1 (assign),
   gimple_assign_rhs2 (assign), code);
  }
 +
 +/* Try to optimize
 +   a_5 = { b_7, b_7 + 3, b_7 + 6, b_7 + 9 };
 +   style stmts into:
 +   _9 = { b_7, b_7, b_7, b_7 };
 +   a_5 = _9 + { 0, 3, 6, 9 };
 +   because vector splat operation is usually more efficient
 +   than piecewise initialization of the vector.  */
 +
 +static void
 +optimize_vector_constructor (gimple_stmt_iterator *gsi)
 +{
 +  gimple stmt = gsi_stmt (*gsi);
 +  tree lhs = gimple_assign_lhs (stmt);
 +  tree rhs = gimple_assign_rhs1 (stmt);
 +  tree type = TREE_TYPE (rhs);
 +  unsigned int i, j, nelts = TYPE_VECTOR_SUBPARTS (type);
 +  bool all_same = true;
 +  constructor_elt *elt;
 +  tree *cst;
 +  gimple g;
 +  tree base = NULL_TREE;
 +
 +  if (nelts = 2 || CONSTRUCTOR_NELTS (rhs) != nelts)
 +return;
 +  FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (rhs), i, elt)
 +if (TREE_CODE (elt-value) != SSA_NAME
 + || TREE_CODE (TREE_TYPE (elt-value)) == VECTOR_TYPE)
 +  return;
 +else
 +  {
 + tree this_base = elt-value;
 + if (this_base != CONSTRUCTOR_ELT (rhs, 0)-value)
 +   all_same = false;
 + for (j = 0; j  nelts + 1; j++)
 +   {
 + g = SSA_NAME_DEF_STMT (this_base);
 + if (is_gimple_assign (g)
 +  gimple_assign_rhs_code (g) == PLUS_EXPR
 +  TREE_CODE (gimple_assign_rhs2 (g)) == INTEGER_CST
 +  TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME
 +  !SSA_NAME_OCCURS_IN_ABNORMAL_PHI (gimple_assign_rhs1 (g)))
 +   this_base = gimple_assign_rhs1 (g);
 + else
 +   break;
 +   }

why loop here?  Do you want to catch base + 1 + 2?  I think that's
hiding a missed optimization elsewhere for no good reason.

 + if (i == 0)
 +   base = this_base;
 + else if (this_base != base)
 +   return;
 +  }
 +  if (all_same)
 +return;
 +  cst = XALLOCAVEC (tree, nelts);
 +  for (i = 0; i  nelts; i++)
 +{
 +  tree this_base = CONSTRUCTOR_ELT (rhs, i)-value;;
 +  cst[i] = build_zero_cst (TREE_TYPE (base));
 +  while (this_base != base)
 + {
 +   g = SSA_NAME_DEF_STMT (this_base);
 +   cst[i] = fold_binary (PLUS_EXPR, TREE_TYPE (base),
 + cst[i], gimple_assign_rhs2 (g));
 +   if (cst[i] == NULL_TREE
 +   || TREE_CODE (cst[i]) != 

[PATCH] Make forwprop fold series of VIEW_CONVERT_EXPRs

2013-11-21 Thread Richard Biener

This moves another fold-const.c folding to the GIMPLE level.
In PR59058 it was noticed we fail to optimize

  vect_vec_iv_.16_57 = VIEW_CONVERT_EXPRvector(8) short 
int(vect_vec_iv_.15_55);
  vect_b.17_58 = VIEW_CONVERT_EXPRvector(8) unsigned 
short(vect_vec_iv_.16_57);

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2013-11-21  Richard Biener  rguent...@suse.de

* tree-ssa-forwprop.c (simplify_vce): New function.
(ssa_forward_propagate_and_combine): Call it.

Index: gcc/tree-ssa-forwprop.c
===
*** gcc/tree-ssa-forwprop.c (revision 205121)
--- gcc/tree-ssa-forwprop.c (working copy)
*** combine_conversions (gimple_stmt_iterato
*** 2994,2999 
--- 2994,3062 
return 0;
  }
  
+ /* Combine VIEW_CONVERT_EXPRs with their defining statement.  */
+ 
+ static bool
+ simplify_vce (gimple_stmt_iterator *gsi)
+ {
+   gimple stmt = gsi_stmt (*gsi);
+   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+ 
+   /* Drop useless VIEW_CONVERT_EXPRs.  */
+   tree op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
+   if (useless_type_conversion_p (type, TREE_TYPE (op)))
+ {
+   gimple_assign_set_rhs1 (stmt, op);
+   update_stmt (stmt);
+   return true;
+ }
+ 
+   if (TREE_CODE (op) != SSA_NAME)
+ return false;
+ 
+   gimple def_stmt = SSA_NAME_DEF_STMT (op);
+   if (!is_gimple_assign (def_stmt))
+ return false;
+ 
+   tree def_op = gimple_assign_rhs1 (def_stmt);
+   switch (gimple_assign_rhs_code (def_stmt))
+ {
+ CASE_CONVERT:
+   /* Strip integral conversions that do not change the precision.  */
+   if ((INTEGRAL_TYPE_P (TREE_TYPE (op))
+  || POINTER_TYPE_P (TREE_TYPE (op)))
+  (INTEGRAL_TYPE_P (TREE_TYPE (def_op))
+ || POINTER_TYPE_P (TREE_TYPE (def_op)))
+  (TYPE_PRECISION (TREE_TYPE (op))
+ == TYPE_PRECISION (TREE_TYPE (def_op
+   {
+ TREE_OPERAND (gimple_assign_rhs1 (stmt), 0) = def_op;
+ update_stmt (stmt);
+ return true;
+   }
+   break;
+ 
+ case VIEW_CONVERT_EXPR:
+   /* Series of VIEW_CONVERT_EXPRs on register operands can
+be contracted.  */
+   if (TREE_CODE (TREE_OPERAND (def_op, 0)) == SSA_NAME)
+   {
+ if (useless_type_conversion_p (type,
+TREE_TYPE (TREE_OPERAND (def_op, 0
+   gimple_assign_set_rhs1 (stmt, TREE_OPERAND (def_op, 0));
+ else
+   TREE_OPERAND (gimple_assign_rhs1 (stmt), 0)
+   = TREE_OPERAND (def_op, 0);
+ update_stmt (stmt);
+ return true;
+   }
+ 
+ default:;
+ }
+ 
+   return false;
+ }
+ 
  /* Combine an element access with a shuffle.  Returns true if there were
 any changes made, else it returns false.  */
   
*** ssa_forward_propagate_and_combine (void)
*** 3491,3496 
--- 3554,3561 
  
changed = did_something != 0;
  }
+   else if (code == VIEW_CONVERT_EXPR)
+ changed = simplify_vce (gsi);
else if (code == VEC_PERM_EXPR)
  {
int did_something = simplify_permutation (gsi);


Re: [SH] PR 53976 - Add RTL pass to eliminate clrt, sett insns

2013-11-21 Thread Oleg Endo
Steven,

Thanks for the feedback.  I've committed the original patch as-is, but
I'm happy to improve it with follow up patches.

On Thu, 2013-11-21 at 00:04 +0100, Steven Bosscher wrote:
 On Wed, Nov 20, 2013 at 8:29 PM, Oleg Endo wrote:
 
  The attached patch adds another SH specific RTL pass which is supposed
  to optimize clrt and sett instruction usage.  As a start, it currently
  just eliminates redundant clrt and sett instructions in cases where the
  T bit value is known.  However, I'm planning on extending it a little in
  order to e.g. hoist clrt/sett insns out of loops etc.
 
 +#include vector

Is there something wrong with vector ?


  +#define log_msg(...)\
  +  do { if (dump_file != NULL) fprintf (dump_file, __VA_ARGS__); } while (0)
 
 Is that valid C++98, a varags macro?

No it's not.  It's C99 / C++11.  However, most compilers support
__VA_ARGS__.  If it causes too much trouble I'll remove it of course
(sh_treg_combine.cc would also be affected).
Having to write if (dump_file ...) stuff over and over again is
annoying and impacts the readability of code in my opinion.  So if the
__VA_ARGS__ usage has to go, there should be some more or less
equivalent alternative.  I think other passes would also benefit from that.
BTW something std::ostream like would be nice for logging, like...

  log_msg (updated ccreg mode: );
  log_rtx (m_ccreg);
  log_msg (\n\n);

... could be something like

log ( updated ccreg mode:   m_ccreg  \n\n);

where log is:
#define log(expr)\
  do { if (dump_file != NULL) logging ostream ref expr; } while (0)

Since there are various ways of printing an rtx (print_rtl,
print_rtl_single), this could be done with a wrapper object around the
logged rtx:

log ( got insn: \n  rtx::log_single (my_insn)  \n);

or iomanip style (requires custom ostream):

log ( got insn: \n  rtx::log_single  my_insn  \n);

... but I'm not sure how others think about that.  If this is of
interest I could do some work in that direction.

  +// Notice that the CFG might be invalid at late RTL stages and
  +// BLOCK_FOR_INSN might return null.  Thus the basic block are recorded
  +// here while traversing them.
  +basic_block bb;
 
 You insert your pass just before sched2. The CFG is perfectly fine at
 that stage. So you shouldn't need this. (And if you would need to
 record bb, then this solution wouldn't be GC-safe).

Why is that?  AFAIK GC is done after the pass has finished?  The pass
doesn't keep any state beyond the current pass execution.

 
 BLOCK_FOR_INSN will only be NULL for things that are not inside a
 basic block (some notes, deleted labels, etc.).
 
 That all said and done: AFAICT you don't actually use BLOCK_FOR_INSN
 anywhere :-)

Sorry for the confusion.  Initially I had it inserted right before
machine dependent reorg, at which point I was getting nullptr from
BLOCK_FOR_INSN all over the place.  So not using it was the fix.
However, only after I've noticed that SH's machine dependent reorg
leaves the basic block structure in a 'broken' state (PR 59189) and thus
I moved the pass before sched2.
I'll try using BLOCK_FOR_INSN again.

 
  +for (edge_iterator ei = ei_start (bb-preds); !ei_end_p (ei); ei_next 
  (ei))
 
 FOR_EACH_EDGE
 
 Declaring the edge_iterator inside the for() is not a good argument
 against FOR_EACH_EDGE. 

But that was the only reason! ;)
(I've done the same in sh_treg_combine.cc)
Can we ...

 Of course, brownie points are up for grabs for
 the brave soul daring enough to make edge iterators be proper C++
 iterators... ;-)

... fix this first and use the SH RTL passes as examples.  Then migrate
other existing code?  Or do you insist on FOR_EACH_EDGE usage for the
time being?

  +if (pred_bb-index == ENTRY_BLOCK)
 
 I used to dislike this idiom of checking bb-index against fixed block
 numbers. But now that ENTRY_BLOCK_PTR and EXIT_BLOCK_PTR actually
 require two pointer dereferences (cfun-cfg-...) it's the better
 option.
 
 /me adds to TODO list...

Actually I don't need any of that.  I will re-test the following
(although it's quite obvious):

Index: gcc/config/sh/sh_optimize_sett_clrt.cc
===
--- gcc/config/sh/sh_optimize_sett_clrt.cc  (revision 205191)
+++ gcc/config/sh/sh_optimize_sett_clrt.cc  (working copy)
@@ -309,9 +309,6 @@
  std::vectorccreg_value values_out,
  basic_block prev_visited_bb) const
 {
-  if (start_insn == NULL_RTX)
-return;
-
   log_msg (looking for ccreg values in [bb %d]\n, bb-index);
 
   for (rtx i = start_insn; i != NULL_RTX  i != PREV_INSN (BB_HEAD (bb));
@@ -376,9 +373,6 @@
 for (edge_iterator ei = ei_start (bb-preds); !ei_end_p (ei); ei_next 
(ei))
   {
basic_block pred_bb = ei_edge (ei)-src;
-   if (pred_bb-index == ENTRY_BLOCK)
- continue;
-
pred_bb_count += 1;
find_last_ccreg_values (BB_END (pred_bb), pred_bb, values_out, bb);
   }


 
  

Re: [PATCH] Improve { x, x + 3, x + 6, x + 9 } expansion (take 2)

2013-11-21 Thread Jakub Jelinek
On Thu, Nov 21, 2013 at 12:18:45PM +0100, Richard Biener wrote:
  Bootstrap/regtest pending, ok at least for this for the start and can be
  improved later on?
 
 Ok, this should catch most of the vectorizer cases.
 
 Zero could also be handled for PLUS_EXPR, likewise one for MULT_EXPR.
 I think for induction it's common to have { base, base + 1, base + 2, ... 

Of course I handle base for PLUS_EXPR (i.e. zero addend), what I meant
is that for MULT_EXPR, you can actually not have any base at all for
a subset of the elements, just constant 0, because when you multiply
arbitrary base with 0, you get 0.

 why loop here?  Do you want to catch base + 1 + 2?  I think that's
 hiding a missed optimization elsewhere for no good reason.

I had that in the patch first, unfortunately it is a pass ordering issue.
  stmp_var_.25_67 = x_27 + 3;
  stmp_var_.25_68 = stmp_var_.25_67 + 3;
  stmp_var_.25_69 = stmp_var_.25_68 + 3;
  stmp_var_.25_70 = stmp_var_.25_69 + 3;
  stmp_var_.25_71 = stmp_var_.25_70 + 3;
  stmp_var_.25_72 = stmp_var_.25_71 + 3;
  stmp_var_.25_73 = stmp_var_.25_72 + 3;
  vect_cst_.26_74 = {x_27, stmp_var_.25_67, stmp_var_.25_68, stmp_var_.25_69, 
stmp_var_.25_70, stmp_var_.25_71, stmp_var_.25_72, stmp_var_.25_73};
is exactly what I see in the last veclower pass, because there is no
forwprop between vect pass and veclower.  So, do you want to schedule
another forwprop before veclower?  Moving veclower later sounds bad,
we really need the stuff created by veclower cleaned up too.

Jakub


Re: [PATCH] Improve { x, x + 3, x + 6, x + 9 } expansion (take 2)

2013-11-21 Thread Richard Biener
On Thu, 21 Nov 2013, Jakub Jelinek wrote:

 On Thu, Nov 21, 2013 at 12:18:45PM +0100, Richard Biener wrote:
   Bootstrap/regtest pending, ok at least for this for the start and can be
   improved later on?
  
  Ok, this should catch most of the vectorizer cases.
  
  Zero could also be handled for PLUS_EXPR, likewise one for MULT_EXPR.
  I think for induction it's common to have { base, base + 1, base + 2, ... 
 
 Of course I handle base for PLUS_EXPR (i.e. zero addend), what I meant
 is that for MULT_EXPR, you can actually not have any base at all for
 a subset of the elements, just constant 0, because when you multiply
 arbitrary base with 0, you get 0.
 
  why loop here?  Do you want to catch base + 1 + 2?  I think that's
  hiding a missed optimization elsewhere for no good reason.
 
 I had that in the patch first, unfortunately it is a pass ordering issue.
   stmp_var_.25_67 = x_27 + 3;
   stmp_var_.25_68 = stmp_var_.25_67 + 3;
   stmp_var_.25_69 = stmp_var_.25_68 + 3;
   stmp_var_.25_70 = stmp_var_.25_69 + 3;
   stmp_var_.25_71 = stmp_var_.25_70 + 3;
   stmp_var_.25_72 = stmp_var_.25_71 + 3;
   stmp_var_.25_73 = stmp_var_.25_72 + 3;
   vect_cst_.26_74 = {x_27, stmp_var_.25_67, stmp_var_.25_68, stmp_var_.25_69, 
 stmp_var_.25_70, stmp_var_.25_71, stmp_var_.25_72, stmp_var_.25_73};
 is exactly what I see in the last veclower pass, because there is no
 forwprop between vect pass and veclower.  So, do you want to schedule
 another forwprop before veclower?  Moving veclower later sounds bad,
 we really need the stuff created by veclower cleaned up too.

Oh, indeed.  Bah.  That case makes the whole stuff quadratic, too ;)
For

typedef int vLARGEsi __attribute__((vector_size(1024*1024)));

(we seem to ICE with vector_size(1024*1024*1024) in stor-layout.c - heh)

Or do we split up the IL into vectors which have a mode before
optimizing the constructors like above?

That said, I'm fine with the patch as-is - we can look at some
really-large-vectors testcases as followup (I'd expect we have
other issues with them ...)

Richard.


RE: [PATCH RFC] MIPS add support for MIPS SIMD ARCHITECTURE V1.07

2013-11-21 Thread Graham Stott
Hi Joseph,

Thanks for the comments. I will address these issues and send an updated patch.

Graham





Re: [PATCH] Improve { x, x + 3, x + 6, x + 9 } expansion (take 2)

2013-11-21 Thread Jakub Jelinek
On Thu, Nov 21, 2013 at 12:37:01PM +0100, Richard Biener wrote:
 Oh, indeed.  Bah.  That case makes the whole stuff quadratic, too ;)

True, O(nelts^2), but largest nelts we have right now is 64 (V64QImode
on -mavx512f).

 For
 
 typedef int vLARGEsi __attribute__((vector_size(1024*1024)));

The optimization is done only for VECTOR_MODE_P CONSTRUCTORs, if
there is no HW support, the vector will live in memory and the optimization
doesn't make sense.

Jakub


Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread Tobias Burnus
Hi FX,

some first remarks.

FX wrote:
 I would like to get testing from:
   – a Solaris target (to test config/fpu-sysv.h)
  – an AIX target (to test config/fpu-aix.h)

For AIX, you could ask David Edelsohn for review.

For x87/SSE, you could ask Uros Bizjak for review.

For SysV you could ask Gerald Pfeifer (FreeBSD) and Eric Botcazou (Solaris)
for testing. (As libgfortran/config patching has shown, the two systems can
have different definitions.)


 --- gcc/testsuite/lib/target-supports.exp (revision 205019)
 +++ gcc/testsuite/lib/target-supports.exp (working copy)
  proc add_options_for_ieee { flags } {
...
 +set extra -fno-unsafe-math-optimizations -frounding-math 
 -fsignaling-nans -fintrinsic-modules-path $specpath/libgfortran/

That part looks wrong: I think you do not want to add -fintrinsic-modules-path
for all IEEE functions, e.g. C and C++ compilers do not handle that option,
nor does the Ada compiler.

You could also ask Mike Stump to review the testsuite changes.


 --- libgfortran/gfortran.map  (revision 205019)
 +++ libgfortran/gfortran.map  (working copy)
 @@ -1193,6 +1193,97 @@ GFORTRAN_1.5 {
   global:
 _gfortran_ftell2;
 _gfortran_backtrace;
+_gfortran_ieee_copy_sign_4_4_;


GFORTRAN_1.5 was used by GCC 4.8. You should start a new section
when you want to add symbols for a new release.

Tobias


Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread FX
 --- gcc/testsuite/lib/target-supports.exp(revision 205019)
 +++ gcc/testsuite/lib/target-supports.exp(working copy)
 proc add_options_for_ieee { flags } {
 ...
 +set extra -fno-unsafe-math-optimizations -frounding-math 
 -fsignaling-nans -fintrinsic-modules-path $specpath/libgfortran/
 
 That part looks wrong: I think you do not want to add -fintrinsic-modules-path
 for all IEEE functions, e.g. C and C++ compilers do not handle that option,
 nor does the Ada compiler.

Hum. That’s unfortunate, because I haven’t found any other suitable place :)
I do not see how to specify compiler flags only for Fortran.

 You could also ask Mike Stump to review the testsuite changes.

Mike, in your understanding, is there any place where Fortran-only flags could 
be specified in the testsuite?

FX

Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread Mike Stump
On Nov 21, 2013, at 3:58 AM, FX fxcoud...@gmail.com wrote:
 --- gcc/testsuite/lib/target-supports.exp   (revision 205019)
 +++ gcc/testsuite/lib/target-supports.exp   (working copy)
 proc add_options_for_ieee { flags } {
 ...
 +set extra -fno-unsafe-math-optimizations -frounding-math 
 -fsignaling-nans -fintrinsic-modules-path $specpath/libgfortran/
 
 That part looks wrong: I think you do not want to add 
 -fintrinsic-modules-path
 for all IEEE functions, e.g. C and C++ compilers do not handle that option,
 nor does the Ada compiler.
 
 Hum. That’s unfortunate, because I haven’t found any other suitable place :)
 I do not see how to specify compiler flags only for Fortran.
 
 You could also ask Mike Stump to review the testsuite changes.
 
 Mike, in your understanding, is there any place where Fortran-only flags 
 could be specified in the test suite?

But you're doing such a fine job!

Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread Uros Bizjak
Hello!

 Here’s my patch submission for adding IEEE intrinsic modules (following 
 Fortran 2003 and 2008
 standards) to gfortran. It implements the item 1, and part of item 2, of my 
 initial plan [1]. All the
 IEEE modules, types, named constants, procedures are defined and fully 
 working. The patch
 comes of course with plenty of testcases, and I can add some more if you can 
 think of things I’ve
  forgotten. I’ve bootstrapped and regtested the patch on:

  __asm__ __volatile__ (fnclex\n\tfldcw\t%0 : : m (cw));

@@ -136,16 +165,54 @@ set_fpu (void)
   __asm__ __volatile__ (%vstmxcsr\t%0 : =m (cw_sse));

   /* The SSE exception masks are shifted by 7 bits.  */
-  cw_sse |= _FPU_MASK_ALL  7;
-  cw_sse = ~(excepts  7);
-
-  /* Clear stalled exception flags.  */
-  cw_sse = ~_FPU_EX_ALL;

You have to clear stalled SSE exceptions here. Their flags are in LSB
bits, so their position is different than the position of exception
mask bits in the control word.

+  /* Change the flags. This is tricky on 387 (unlike SSE), because we have
+ FNSTSW but no FLDSW instruction.  */
+  __asm__ __volatile__ (fnstenv\t%0 : =m (*temp));
+
+  temp.__status_word = ~exc_clr;
+  temp.__status_word |= exc_set;
+
+  __asm__ __volatile__ (fldenv\t%0 : : m (*temp));

Why do you need * here?

fldenv will also trigger exceptions with set flags on the next x87 FP insn ...

+__asm__ __volatile__ (%vstmxcsr\t%0 : =m (cw_sse));
+
+cw_sse = ~exc_clr;
+cw_sse |= exc_set;
+
+__asm__ __volatile__ (%vldmxcsr\t%0 : : m (cw_sse));

... and ldmxcsr won't trigger exceptions, neither with SSE insn.
Please see Intel documentation on FP exceptions.

Uros.


Re: [PATCH] libstdc++ testsuite cxxflags

2013-11-21 Thread Jonathan Wakely
On 20 November 2013 23:57, Cesar Philippidis wrote:
 On 11/20/13, 1:46 PM, Jonathan Wakely wrote:
 On 20 November 2013 21:44, Jonathan Wakely wrote:
 On 29 October 2013 15:37, Cesar Philippidis wrote:
 This patch addresses two issues with the libstdc++ testsuite:

   * duplicate -g -O2 CXXFLAGS
   * missing -g -O2 for remote targets

 The duplicate -g -O2 flags is a result of testsuite_flags.in using
 build-time CXXFLAGS and proc libstdc++_init using the environmental
 CXXFLAGS, which defaults to its build-time value. This patch prevents
 testsuite_flags.in from using build-time CXXFLAGS.

 Certain remote targets require a minimum optimization level -O1 in order
 to pass several atomics built-in function tests. This patch ensures
 cxxflags contains -g -O2 at minimum when no other optimization flags
 are specified. The testsuite used to set those flags prior to Benjamin's
 patch to remove duplicate cxxflags here
 http://gcc.gnu.org/ml/gcc-patches/2012-03/msg01572.html.

 Is this OK for trunk? If so, please apply (I don't have commit rights).

 I think so ... although I'm not sure I've got my head round the
 effects in all cases!

 Sorry, I didn't realise gmail thought Ctrl-Enter meant send. I meant
 to ask a couple of questions about it ...

 Is removing EXTRA_CXX_FLAGS necessary too?

 I looked at it again, and it seems to be OK to leave it in there.

 For remote targets, if CXXFLAGS is set in the env can -g still end up 
 missing?

 No, but CXXFLAGS isn't necessarily set in the env. Specifically, if you
 run the testsuite without using the makefile, the CXXFLAGS may not be set.

 I revised the patch to preserve @EXTRA_CXX_FLAGS@. I also append the
 '-g' flag with '-O2', since the '-g' isn't as important in the testsuite
 as '-O2'.

 Is this patch OK? Is so, please commit it because I do not have an svn
 account.

I've been playing around with this patch and CXXFLAGS further, and I'm
not sure about it now.

What harm do the duplicate flags do? If you want different flags to be
used when running the testsuite you can set CXXFLAGS, which will come
later on the command-line and so take precedence. However, if we
remove -g -O2 from CXXFLAGS_config and you use CXXFLAGS=-DFOO when
running the testsuite then after this change you won't get the same
result, you'd have to change to use CXXFLAGS=-g -O2 -DFOO

Is that really what we want?


Re: [PATCH, MPX, 2/X] Pointers Checker [14/25] Function splitting

2013-11-21 Thread Richard Biener
On Thu, Nov 21, 2013 at 10:31 AM, Ilya Enkovich enkovich@gmail.com wrote:
 2013/11/20 Richard Biener richard.guent...@gmail.com:
 On Wed, Nov 20, 2013 at 10:57 AM, Richard Biener
 richard.guent...@gmail.com wrote:
 On Tue, Nov 19, 2013 at 9:18 PM, Ilya Enkovich enkovich@gmail.com 
 wrote:
 2013/11/19 Jeff Law l...@redhat.com:
 On 11/19/13 05:20, Ilya Enkovich wrote:

 2013/11/19 Richard Biener richard.guent...@gmail.com:

 On Mon, Nov 18, 2013 at 8:12 PM, Ilya Enkovich enkovich@gmail.com
 wrote:

 2013/11/18 Jeff Law l...@redhat.com:

 On 11/18/13 11:27, Ilya Enkovich wrote:



 How does pointer passed to regular function differ from pointer 
 passed
 to splitted function? How do I know then which pointer is to be 
 passed
 with bounds and wchich one is not? Moreover current ABI does not 
 allow
 to pass bounds with no pointer or pass bounds for some pointers in 
 the
 call only.


 But I don't see any case in function splitting where we're going to
 want to
 pass the pointer without the bounds.  If you want the former, you're
 going
 to want the latter.


 There are at least cases when checks are eliminated or when lots of
 pointer usages are accompanied with few checks performed earlier (e.g.
 we are working with array). In such cases splitted part may easily get
 no bounds.


 I really don't see why you need to do anything special here.  At the
 most an
 assert in the splitting code to ensure that you don't have a situation
 where
 there's mixed pointers with bounds and pointers without bounds should
 be all
 you need or that you passed a bounds with no associated pointer :-)


 It would also require generation of proper bind_bounds calls in the
 original function and arg_bounds calls in a separated part. So,
 special support is required.


 Well, only to keep proper instrumentation.  I hope code still works
 (doesn't trap) when optimizations wreck the bounds?  Thus all
 these patches are improving bounds propagation but are not required
 for correctness?  If so please postpone all of them until after the
 initial support is merged.  If not, please make sure BND instrumentation
 works conservatively when optimizations wreck it.


 All patches I sent for optimization passes are required to avoid ICEs
 when compiling instrumented code.

 Then I think we're going to need to understand them in more detail. That's
 going to mean testcases, probably dumps and some commentary about what 
 went
 wrong.

 I can't speak for Richi, but when optimizations get disabled, I tend to 
 want
 to really understand why and make sure we're not papering over a larger
 problem.

 The tail recursion elimination one we're discussing now is a great 
 example.
 At this point I understand the problem you're running into, but I'm still
 trying to wrap my head around the implications of the funny semantics of
 __builtin_arg_bounds and how they may cause other problems.

 Root of all problems is implicit data flow hidden in arg_bounds and
 bind_bounds.  Calls consume bounds and compiler does not know it. And
 input bounds are always expressed via arg_bounds calls and never
 expressed via formal args. Obviously optimizers have to be taught
 about these data dependencies to work correctly.

 I agree semantics of arg_bounds call creates many issues for
 optimizers but currently I do not see a better replacement for it.

 But it looks incredibly fragile if you ICE once something you don't like
 happens.  You should be able to easily detect the case and punt,
 that is, drop to non-instrumented aka invalidating bounds.

 Thus, I really really don't like these patches.  They hint at some
 deeper problem with the overall design (or the HW feature or the
 accompaning ABI).

 Note that this, the intrusiveness of the feature and the questionable
 gain makes me question whether GCC should have support for this
 feature (and whether we really should rush this in this late).

 Thus, I hereby formally ask to push back this feature to 4.10.

 I think you overestimate the intrusiveness of the checker. Necessity
 of changes in optimization passes is artificial and is used to get
 maximum checking quality. It can be easily made immune for different
 code transformation by simple changes in the process of
 instrumentation expand (I have a fix for that already). With that
 change only pass itself, support for bound args during expand, support
 in i386 target and minor infrastructure changes are required (e.g.
 flag in varpool_node, bounds_constants). Changes in inline,
 propagation, SRA, tail recursion, strlen, function splitting, string
 function builtins expand would become optional and affect checking
 quality only.

 Also note that all changes do not affect compilation process when no
 instrumentation is used.

 Please reconsider your decision about pushing it to 4.10 taking that
 into account.

The point is that we are still pondering over the design and stage1
is basically over.

Richard.

 Thanks,
 Ilya


 Thanks,
 Richard.

 Richard.

Re: [PATCH] Updated automated patch (was Re: [PATCH 3/6] Automated part of conversion of gimple types to use C++ inheritance)

2013-11-21 Thread Jakub Jelinek
On Mon, Nov 18, 2013 at 03:25:52PM -0500, David Malcolm wrote:
  So is there some reason the GIMPLE_CHECK was left in here rather than 
  doing the downcasting?  This happens in other places.

Note that the changes removed tons of checks that IMHO were desirable.
The as_a that replaced those checks e.g. allows 3 different gimple codes,
while previously only one was allowed, this is both more expensive for
--enable-checking=yes, and allows one to use inline wrappers e.g.
gimple_omp_parallel_something on GIMPLE_OMP_TASK etc.

Jakub


Re: [PATCH] Builtins handling in IVOPT

2013-11-21 Thread Wei Mi
 So what you are doing is basically not only rewriting memory references
 to possibly use TARGET_MEM_REF but also address uses to use
 TARGET_MEM_REF.  I think this is a good thing in general
 (given instructions like x86 lea) and I would not bother distinguishing
 the different kind of uses.

 Richard.


You mean to change normal expr to TMR(expr) form in order to utilize
x86 lea type instructions as much as possible. It is interesting. I
can experiment that idea later. I am not sure if it could simply work.
My concern is x86 lea still has some limitation (such as three
operands lea will have longer latency and can only be issued to
port1), if we change some expr to TMR(expr), will it inhibit cse
opportunity if codegen finds out it is not good to use lea?

Thanks,
Wei.


Re: [PATCH] Builtins handling in IVOPT

2013-11-21 Thread Richard Biener
Wei Mi w...@google.com wrote:
 So what you are doing is basically not only rewriting memory
references
 to possibly use TARGET_MEM_REF but also address uses to use
 TARGET_MEM_REF.  I think this is a good thing in general
 (given instructions like x86 lea) and I would not bother
distinguishing
 the different kind of uses.

 Richard.


You mean to change normal expr to TMR(expr) form in order to utilize
x86 lea type instructions as much as possible. It is interesting. I
can experiment that idea later. I am not sure if it could simply work.
My concern is x86 lea still has some limitation (such as three
operands lea will have longer latency and can only be issued to
port1), if we change some expr to TMR(expr), will it inhibit cse
opportunity if codegen finds out it is not good to use lea?

That needs to be determined.  Overall it might be because ivopts runs so 
early.  At rtl level there should not be big differences apart from better 
initial address computations.

Did I misunderstand what your patch does?

Richard.

Thanks,
Wei.




Re: [patch] PR 59195: C++ demangler handles conversion operator incorrectly

2013-11-21 Thread Cary Coutant
I've made a small revision to this patch to handle recursive
invocations of d_expression and d_operator_name, restoring the
previous values of is_expression and is_conversion instead of just
setting them to 0 upon return. I've also added the long test case that
results in a substitution misnumbering in the current demangler.

-cary


 2013-11-19  Cary Coutant  ccout...@google.com

 libiberty/
 PR other/59195
 * cp-demangle.c (struct d_info_checkpoint): New struct.
 (struct d_print_info): Add current_template field.
 (d_operator_name): Set flag when processing a conversion
 operator.
 (cplus_demangle_type): When processing template-args for
 a conversion operator, backtrack if necessary.
 (d_expression_1): Renamed from d_expression.
 (d_expression): New wrapper around d_expression_1.
 (d_checkpoint): New function.
 (d_backtrack): New function.
 (d_print_init): Initialize current_template.
 (d_print_comp): Set current_template.
 (d_print_cast): Put current_template in scope for
 printing conversion operator name.
 (cplus_demangle_init_info): Initialize is_expression and
 is_conversion.
 * cp-demangle.h (struct d_info): Add is_expression and
 is_conversion fields.
 * testsuite/demangle-expected: New test cases.
commit 498efd2d720b48641fe0142295f19438601ea2f1
Author: Cary Coutant ccout...@google.com
Date:   Wed Nov 13 09:28:58 2013 -0800

Fix demangler to handle conversion operators correctly.

2013-11-19  Cary Coutant  ccout...@google.com

libiberty/
PR other/59195
* cp-demangle.c (struct d_info_checkpoint): New struct.
(struct d_print_info): Add current_template field.
(d_operator_name): Set flag when processing a conversion
operator.
(cplus_demangle_type): When processing template-args for
a conversion operator, backtrack if necessary.
(d_expression_1): Renamed from d_expression.
(d_expression): New wrapper around d_expression_1.
(d_checkpoint): New function.
(d_backtrack): New function.
(d_print_init): Initialize current_template.
(d_print_comp): Set current_template.
(d_print_cast): Put current_template in scope for
printing conversion operator name.
(cplus_demangle_init_info): Initialize is_expression and
is_conversion.
* cp-demangle.h (struct d_info): Add is_expression and
is_conversion fields.
* testsuite/demangle-expected: New test cases.

diff --git a/libiberty/cp-demangle.c b/libiberty/cp-demangle.c
index cbe4d8c..029151e 100644
--- a/libiberty/cp-demangle.c
+++ b/libiberty/cp-demangle.c
@@ -287,6 +287,19 @@ struct d_saved_scope
   struct d_print_template *templates;
 };
 
+/* Checkpoint structure to allow backtracking.  This holds copies
+   of the fields of struct d_info that need to be restored
+   if a trial parse needs to be backtracked over.  */
+
+struct d_info_checkpoint
+{
+  const char *n;
+  int next_comp;
+  int next_sub;
+  int did_subs;
+  int expansion;
+};
+
 enum { D_PRINT_BUFFER_LENGTH = 256 };
 struct d_print_info
 {
@@ -318,6 +331,8 @@ struct d_print_info
   struct d_saved_scope *saved_scopes;
   /* Number of saved scopes in the above array.  */
   int num_saved_scopes;
+  /* The nearest enclosing template, if any.  */
+  const struct demangle_component *current_template;
 };
 
 #ifdef CP_DEMANGLE_DEBUG
@@ -444,6 +459,10 @@ d_add_substitution (struct d_info *, struct 
demangle_component *);
 
 static struct demangle_component *d_substitution (struct d_info *, int);
 
+static void d_checkpoint (struct d_info *, struct d_info_checkpoint *);
+
+static void d_backtrack (struct d_info *, struct d_info_checkpoint *);
+
 static void d_growable_string_init (struct d_growable_string *, size_t);
 
 static inline void
@@ -1734,8 +1753,15 @@ d_operator_name (struct d_info *di)
   if (c1 == 'v'  IS_DIGIT (c2))
 return d_make_extended_operator (di, c2 - '0', d_source_name (di));
   else if (c1 == 'c'  c2 == 'v')
-return d_make_comp (di, DEMANGLE_COMPONENT_CAST,
-   cplus_demangle_type (di), NULL);
+{
+  struct demangle_component *type;
+  int was_conversion = di-is_conversion;
+
+  di-is_conversion = ! di-is_expression;
+  type = cplus_demangle_type (di);
+  di-is_conversion = was_conversion;
+  return d_make_comp (di, DEMANGLE_COMPONENT_CAST, type, NULL);
+}
   else
 {
   /* LOW is the inclusive lower bound.  */
@@ -2284,13 +2310,61 @@ cplus_demangle_type (struct d_info *di)
   ret = d_template_param (di);
   if (d_peek_char (di) == 'I')
{
- /* This is template-template-param template-args.  The
-template-template-param part is a substitution
+ /* This may be template-template-param template-args.
+If this is the type for a conversion 

[PATCH][PR tree-optimization/59221] Fix temporary equivalence handling

2013-11-21 Thread Jeff Law


A recent change to the thread discovery code was missing a small, but 
important hunk.  Namely management of the temporary equivalences. 
Amazingly, this didn't cause any bootstrapping problems on x86, but 
Zhendong has a good testcase and it may be causing problems on Sparc.


In a nutshell, we were failing to wipe temporary equivalences when 
processing the successors of a joiner block.  Egad.  It'd actually been 
like this for a while, but the new code makes the oversight more obvious 
and painful.


Bootstrapped and regression tested on x86_64-unknown-linux-gnu. 
Obviously it fixes Zhendong's tesetcase as well.  Installed on the 
trunk.  Sorry for the breakage.


Jeff

PR tree-optimization/59221
* tree-ssa-threadedge.c (thread_across_edge): Properly manage
temporary equivalences when threading through joiner blocks.

PR tree-optimization/59221
* gcc.c-torture/execute/pr59221.c: New test.

diff --git a/gcc/testsuite/gcc.c-torture/execute/pr59221.c 
b/gcc/testsuite/gcc.c-torture/execute/pr59221.c
new file mode 100644
index 000..0cd4259
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr59221.c
@@ -0,0 +1,19 @@
+
+
+int a = 1, b, d;
+short e;
+
+int
+main ()
+{
+  for (; b; b++)
+;
+  short f = a;
+  int g = 15;
+  e = f ? f : 1  g;
+  int h = e;
+  d = h == 83647 ? 0 : h;
+  if (d != 1)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 7bb8829..a144875 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -1072,6 +1072,10 @@ thread_across_edge (gimple dummy_cond,
 /* Look at each successor of E-dest to see if we can thread through it.  
*/
 FOR_EACH_EDGE (taken_edge, ei, e-dest-succs)
   {
+   /* Push a fresh marker so we can unwind the equivalences created
+  for each of E-dest's successors.  */
+   stack-safe_push (NULL_TREE);
+ 
/* Avoid threading to any block we have already visited.  */
bitmap_clear (visited);
bitmap_set_bit (visited, taken_edge-dest-index);
@@ -1118,6 +1122,9 @@ thread_across_edge (gimple dummy_cond,
  {
delete_jump_thread_path (path);
  }
+
+   /* And unwind the equivalence table.  */
+   remove_temporary_equivalences (stack);
   }
 BITMAP_FREE (visited);
   }


Re: [PATCH] Updated automated patch (was Re: [PATCH 3/6] Automated part of conversion of gimple types to use C++ inheritance)

2013-11-21 Thread Jeff Law

On 11/21/13 15:19, Jakub Jelinek wrote:

On Mon, Nov 18, 2013 at 03:25:52PM -0500, David Malcolm wrote:

So is there some reason the GIMPLE_CHECK was left in here rather than
doing the downcasting?  This happens in other places.


Note that the changes removed tons of checks that IMHO were desirable.
The as_a that replaced those checks e.g. allows 3 different gimple codes,
while previously only one was allowed, this is both more expensive for
--enable-checking=yes, and allows one to use inline wrappers e.g.
gimple_omp_parallel_something on GIMPLE_OMP_TASK etc.

Can you give a couple examples, please?

jeff


[PATCH] Fix various power8 tests that I wrote

2013-11-21 Thread Michael Meissner
With my changes for PR 59054, it broke several of the ISA 2.07 tests that I
added due to DImode not being allowed in Altivec/VMX registers.  This patch
adjusts these tests to reflect the current code generation.  In addition, it
looks like I checked in the test for pr 59054 with the code duplicated.

These changes cause all of these tests to now pass.  Are they ok to check in?

2013-11-21  Michael Meissner  meiss...@linux.vnet.ibm.com

PR target/59054
* gcc.target/powerpc/direct-move.h (VSX_REG_ATTR): Allow test to
specify an appropriate register class for VSX operations.
(load_vsx): Use it.
(load_gpr_to_vsx): Likewise.
(load_vsx_to_gpr): Likewise.
* gcc.target/powerpc/direct-move-vint1.c: Use an appropriate
register class for VSX registers that the type can handle.  Remove
checks for explicit number of instructions generated, just check
if the instruction is generated.
* gcc.target/powerpc/direct-move-vint2.c: Likewise.
* gcc.target/powerpc/direct-move-float1.c: Likewise.
* gcc.target/powerpc/direct-move-float2.c: Likewise.
* gcc.target/powerpc/direct-move-double1.c: Likewise.
* gcc.target/powerpc/direct-move-double2.c: Likewise.
* gcc.target/powerpc/direct-move-long1.c: Likewise.
* gcc.target/powerpc/direct-move-long2.c: Likewise.

* gcc.target/powerpc/pr59054.c: Remove duplicate code.

* gcc.target/powerpc/bool3-av.c: Limit to 64-bit mode for now.
* gcc.target/powerpc/bool3-p7.c: Likewise.
* gcc.target/powerpc/bool3-p8.c: Likewise.

* gcc.target/powerpc/p8vector-ldst.c: Just check that the
appropriate instructions are generated, don't check the count.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/testsuite/gcc.target/powerpc/direct-move-vint1.c
===
--- gcc/testsuite/gcc.target/powerpc/direct-move-vint1.c(revision 
205140)
+++ gcc/testsuite/gcc.target/powerpc/direct-move-vint1.c(working copy)
@@ -3,11 +3,12 @@
 /* { dg-skip-if  { powerpc*-*-*spe* } { * } {  } } */
 /* { dg-require-effective-target powerpc_p8vector_ok } */
 /* { dg-options -mcpu=power8 -O2 } */
-/* { dg-final { scan-assembler-times mtvsrd 4 } } */
-/* { dg-final { scan-assembler-times mfvsrd 4 } } */
+/* { dg-final { scan-assembler mtvsrd } } */
+/* { dg-final { scan-assembler mfvsrd } } */
 
-/* Check code generation for direct move for long types.  */
+/* Check code generation for direct move for vector types.  */
 
 #define TYPE vector int
+#define VSX_REG_ATTR wa
 
 #include direct-move.h
Index: gcc/testsuite/gcc.target/powerpc/pr59054.c
===
--- gcc/testsuite/gcc.target/powerpc/pr59054.c  (revision 205140)
+++ gcc/testsuite/gcc.target/powerpc/pr59054.c  (working copy)
@@ -4,15 +4,3 @@
 /* { dg-options -mcpu=power7 -O0 -m64 } */
 
 long foo (void) { return 0; }
-
-/* { dg-final { scan-assembler-not xxlor } } */
-/* { dg-final { scan-assembler-not stfd } } */
-/* { dg-do compile { target { powerpc*-*-*  lp64 } } } */
-/* { dg-skip-if  { powerpc*-*-darwin* } { * } {  } } */
-/* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options -mcpu=power7 -O0 -m64 } */
-
-long foo (void) { return 0; }
-
-/* { dg-final { scan-assembler-not xxlor } } */
-/* { dg-final { scan-assembler-not stfd } } */
Index: gcc/testsuite/gcc.target/powerpc/direct-move-vint2.c
===
--- gcc/testsuite/gcc.target/powerpc/direct-move-vint2.c(revision 
205140)
+++ gcc/testsuite/gcc.target/powerpc/direct-move-vint2.c(working copy)
@@ -8,5 +8,6 @@
 
 #define TYPE vector int
 #define DO_MAIN
+#define VSX_REG_ATTR wa
 
 #include direct-move.h
Index: gcc/testsuite/gcc.target/powerpc/bool3-p7.c
===
--- gcc/testsuite/gcc.target/powerpc/bool3-p7.c (revision 205140)
+++ gcc/testsuite/gcc.target/powerpc/bool3-p7.c (working copy)
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-do compile { target { powerpc*-*-*  lp64 } } } */
 /* { dg-skip-if  { powerpc*-*-darwin* } { * } {  } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
 /* { dg-options -O2 -mcpu=power7 } */
Index: gcc/testsuite/gcc.target/powerpc/direct-move.h
===
--- gcc/testsuite/gcc.target/powerpc/direct-move.h  (revision 205140)
+++ gcc/testsuite/gcc.target/powerpc/direct-move.h  (working copy)
@@ -3,6 +3,10 @@
 #include math.h
 extern void abort (void);
 
+#ifndef VSX_REG_ATTR
+#define VSX_REG_ATTR wa
+#endif
+
 void __attribute__((__noinline__))
 copy (TYPE *a, TYPE *b)
 {
@@ -44,7 +48,7 @@ void __attribute__((__noinline__))
 

Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread Joseph S. Myers
On Thu, 21 Nov 2013, N.M. Maclaren wrote:

 On Nov 21 2013, Joseph S. Myers wrote:
  On Thu, 21 Nov 2013, FX wrote:
  
   Indeed, 387/SSE has flush-to-zero modes. But other APIs do not (glibc,
   SysV, AIX).
  
  Note that glibc libm functions may not work when called in a flush-to-zero
  mode, only in modes that can be established by the fenv.h functions.
 
 Well, that's two clear bugs :-(
 
 If, as you say, they may not work at all in combination with -ffast-math,
 that's one.

What does work with -ffast-math is inherently poorly defined - there's a 
general expectation that things work with well-behaved arguments, with a 
requirement that functions not be called with arguments that would result 
in overflow or underflow (or with arguments needing special care to avoid 
internal overflows or underflows, or very large arguments to trig 
functions, for example).

 Setting __STDC_IEC_559__ to 1 in combination with -ffast-math is another,
 given that C99 and C11 reference ISO/IEC 10559 (1989).

Given glibc 2.19 and GCC 4.9 (neither yet released), __STDC_IEC_559__ 
should no longer be defined if -ffast-math, or other options indicating a 
user intent contrary to Annex F, is used.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: Implement C11 _Atomic

2013-11-21 Thread Hans-Peter Nilsson
On Thu, 21 Nov 2013, Andrew MacLeod wrote:
  Or is that part also required for
  anything-other-than-ordinary-C-type alignment for the target;
  say, natural 4-byte alignment of 4-byte-types for targets where
  alignment is otherwise packed; where only 1-byte alignment of
  the basic type is ABI-mandated?
 
 If I understand correctly, yes, it would be needed there as well if the
 compiler thinks alignof (int) is 1, then I believe it will set alignof(_Atomic
 int) to 1 as well, and that's probably not good.

Right.

 Basically, atomic_types are given their own type nodes in tree.c:
   atomicQI_type_node = build_atomic_base (unsigned_intQI_type_node);
   atomicHI_type_node = build_atomic_base (unsigned_intHI_type_node);
   atomicSI_type_node = build_atomic_base (unsigned_intSI_type_node);
   atomicDI_type_node = build_atomic_base (unsigned_intDI_type_node);
   atomicTI_type_node = build_atomic_base (unsigned_intTI_type_node);

It sounds like I should be able to hack that from the port in
some other tree initializer hook?

I'm trying to avoid ABI breakage of course.  I'd rather not have
to ask people not to use _Atomic with 4.9 for CRIS ports using
official releases or have ABI breakage with the next release.
Maybe there's one other port in the same situation...

 and on the branch that code instead looks something like:

 #define SET_ATOMIC_TYPE_NODE(TYPE, MODE, DEFAULT)   \
  (TYPE) = build_atomic_base (DEFAULT, targetm.atomic_align_for_mode (MODE));

   SET_ATOMIC_TYPE_NODE (atomicQI_type_node, QImode, unsigned_intQI_type_node);
 ...

 which provides a target hook to override the default values and a target can
 set them to whatever it deems necessary.

Yah, that'd be nice.  Doesn't sound like more than the target
hook and the patched lines above left for that to happen,
though?  Or perhaps that's a too-naive assumption. I guess I
should have a look.

 There was insufficient time to test and fully flesh this out, so it hasn't
 made it into mainline.  It's only thanks to Joseph's heroic efforts we have C11
 :-)

 I don't think it's a lot of code if you wanted to fool with it for your port.

So, what would be needed in terms of testing and coding to get
that part into 4.9?

brgds, H-P


RE: _Cilk_spawn and _Cilk_sync for C++

2013-11-21 Thread Iyer, Balaji V
Hi Jason,
Please see my responses below. I have also attached a fixed patch and 
the Changelog entries are cut and pasted below.

 -Original Message-
 From: Jason Merrill [mailto:ja...@redhat.com]
 Sent: Thursday, November 21, 2013 1:59 PM
 To: Iyer, Balaji V; gcc-patches@gcc.gnu.org
 Cc: Jeff Law
 Subject: Re: _Cilk_spawn and _Cilk_sync for C++
 
 On 11/17/2013 10:19 PM, Iyer, Balaji V wrote:
 cp/cp-cilkplus.o \
  - cp/cp-gimplify.o cp/cp-array-notation.o cp/lambda.o \
  + cp/cp-gimplify.o cp/cp-array-notation.o cp/lambda.o cp/cp-cilk.o \
 
 It seems unnecessary to have both cp-cilk.c and cp-cilkplus.c.  Please
 combine them.

Fixed. I removed cp-cilk.c and moved my work to cp-cilkplus.c. This will change 
my _Cilk_for for C++ work also, since I used cp-cilk.c to store all my 
routines. I will send out a fixed patch soon.

 
  +  extern tree do_begin_catch (void);
  +  extern tree do_end_catch (tree);
 
 If you want to use these, they need to be declared in cp-tree.h, not within
 another function.  Or better yet, factor out this code:
 

Fixed. I created a new function called cilk_create_try_catch () in cp/except.c

  +  append_to_statement_list (do_begin_catch (), catch_list);
  +  append_to_statement_list (build_throw (NULL_TREE), catch_list);
  +  tree catch_tf_expr = build_stmt (EXPR_LOCATION (body),
 TRY_FINALLY_EXPR,
  +  catch_list, do_end_catch (NULL_TREE));
  +  catch_list = build2 (CATCH_EXPR, void_type_node, NULL_TREE,
  +  catch_tf_expr);
  +  tree try_catch_expr = build_stmt (EXPR_LOCATION (body),
 TRY_CATCH_EXPR,
  +   body, catch_list);
 
 ...into a function in cp/except.c.
 

Yep, this is what I did.

  +  tree try_finally_expr = build_stmt (EXPR_LOCATION (body),
  + TRY_FINALLY_EXPR,
  +try_catch_expr, dtor);
  +  append_to_statement_list (try_finally_expr, list);
  +}
  +  else
  +append_to_statement_list (build_stmt (EXPR_LOCATION (body),
  + TRY_FINALLY_EXPR, body, dtor),
 list);
 
 This bit could be shared between the two branches.

Fixed. 

 
  +  /* When Cilk Plus is enabled, the lambda function need to be stored to
  + a variable because if the function is spawned, then we need some kind
  + of a handle.  */
  +  if (flag_enable_cilkplus  cxx_dialect = cxx0x
  +   TREE_CODE (fn) != VAR_DECL  TREE_CODE (fn) != OVERLOAD
  +   TREE_CODE (fn) != FUNCTION_DECL)
  +fn = cilk_create_lambda_fn_tmp_var (fn);
 
 I don't like making this change here.  What do you need a handle for?
 Why can't build_cilk_spawn deal with it?
 

The reason is that, when you have something like this:

_Cilk_spawn [=]  { body } ();

I need to capture the function call (which in this case is the whole function) 
and throw it into a nested function.  The nested function implementation is 
shared with C. If the function is stored in a variable then I can just send 
that out to the nested function. I have added another constraint to make sure 
the function is a spawning function, this way we can reduce more cases were 
they are stored to a variable. The reason why I added this check in 
finish_call_expr is that it seemed to be most straight-forward for me and only 
place where I could do with least disruption (code-changes).

  +case CILK_SPAWN_STMT:
  +  if (!potential_constant_expression_1 (CILK_SPAWN_FN (t), true,
 flags))
  +   return false;
  +  return true;
 
 Isn't Cilk spawn itself is non-constant, so you can just return false?
 

Fixed.

Here are the ChangeLog entries:

gcc/cp/ChangeLog
2013-11-21  Balaji V. Iyer  balaji.v.i...@intel.com

* cp-tree.h (cilk_valid_spawn): New prototype.
(gimplify_cilk_spawn): Likewise.
(cp_cilk_install_body_wframe_cleanup): Likewise.
(cilk_create_lambda_fn_tmp_var): Likewise.
(create_cilk_try_catch): Likewise.
* decl.c (finish_function): Insert Cilk function-calls when a
_Cilk_spawn is used in a function.
* parser.c (cp_parser_postfix_expression): Added RID_CILK_SPAWN and
RID_CILK_SYNC cases.
* cp-cilkplus.c (set_cilk_except_flag): New function.
(set_cilk_except_data): Likewise.
(cp_cilk_install_body_wframe_cleanup): Likewise.
(cilk_create_lambda_fn_tmp_var): Likewise.
* except.c (create_cilk_try_catch): Likewise.
* parser.h (IN_CILK_SPAWN): New #define.
* cp-objcp-common.h (LANG_HOOKS_CILKPLUS_GIMPLIFY_SPAWN): Likewise.
(LANG_HOOKS_CILKPLUS_DETECT_SPAWN_AND_UNWRAP): Likewise.
(LANG_HOOKS_CILKPLUS_FRAME_CLEANUP): Likewise.
* pt.c (tsubst_expr): Added CILK_SPAWN_STMT and CILK_SYNC_STMT cases.
* semantics.c (potential_constant_expression_1): Likewise.
(finish_call_expr): Stored the lambda function to a variable when Cilk
Plus is 

Re: [PATCH] Updated automated patch (was Re: [PATCH 3/6] Automated part of conversion of gimple types to use C++ inheritance)

2013-11-21 Thread Jakub Jelinek
On Thu, Nov 21, 2013 at 03:24:55PM -0700, Jeff Law wrote:
 On 11/21/13 15:19, Jakub Jelinek wrote:
 On Mon, Nov 18, 2013 at 03:25:52PM -0500, David Malcolm wrote:
 So is there some reason the GIMPLE_CHECK was left in here rather than
 doing the downcasting?  This happens in other places.
 
 Note that the changes removed tons of checks that IMHO were desirable.
 The as_a that replaced those checks e.g. allows 3 different gimple codes,
 while previously only one was allowed, this is both more expensive for
 --enable-checking=yes, and allows one to use inline wrappers e.g.
 gimple_omp_parallel_something on GIMPLE_OMP_TASK etc.
 Can you give a couple examples, please?

I mean e.g.
gimple_omp_parallel_{,set_}{clauses,child_fn,data_arg}{,_ptr}
gimple_omp_taskreg_{,set_}{clauses,child_fn,data_arg}{,_ptr}
gimple_omp_target_{,set_}{clauses,child_fn,data_arg}{,_ptr}
gimple_omp_teams_{,set_}clauses{,_ptr}
gimple_omp_return_{,set_}lhs{,_ptr}
gimple_omp_atomic_store_{,set_}val{,_ptr}
gimple_resx_{,set_}region
gimple_eh_dispatch_{,set_}region

Jakub


[patch] Atomic alignment override.

2013-11-21 Thread Andrew MacLeod
I'd like to check in this code from the C11 branch so that it is present 
in 4.9.


It adds a target hook which can be used to override the default 
alignment of an atomic type when used with C11's _Atomic qualifier. 
There are a couple of ports which have stricter alignment requirements 
for an atomic operation than the natural alignment of the integral type. 
   Today they are just broken with no real facility to repair it.


If this hook is not utilized by a port, the code is transparent and 
should therefore be harmless in the code base.  It will enable the ports 
that care to experiment with changing alignment and see if their current 
problems can be resolved for C11  and then we can look to push that 
into C++11 atomic templates somehow.  It will also allow them to flush 
out any remaining problems that show up with the fundamental base code 
which enables it that went in as part of C11.


Bootstraps on x86-64 and currently running regression tests. Assuming 
everything passes, OK for mainline?


I won't actually check it in until HP reports back that it actually is 
useful to him, I just want to submit it before stage 1 ends :-).


Andrew
	
	* hooks.h (hook_uint_mode_0): Add Prototype.
	* hooks.c (hook_uint_mode_0): New default function.
	* target.def (atomic_align_for_mode): New target hook.
	* tree.c (build_atomic_base): Add alignment override parameter.
	(build_common_tree_nodes): Use atomic alignment override.
	* doc/tm.texi.in (TARGET_ATOMIC_ALIGN_FOR_MODE): Define.
	* doc/tm.texi (TARGET_ATOMIC_ALIGN_FOR_MODE): Add description.


Index: hooks.h
===
*** hooks.h	(revision 205220)
--- hooks.h	(working copy)
*** extern tree hook_tree_tree_tree_tree_3rd
*** 92,97 
--- 92,98 
  extern tree hook_tree_tree_int_treep_bool_null (tree, int, tree *, bool);
  
  extern unsigned hook_uint_void_0 (void);
+ extern unsigned int hook_uint_mode_0 (enum machine_mode);
  
  extern bool default_can_output_mi_thunk_no_vcall (const_tree, HOST_WIDE_INT,
  		  HOST_WIDE_INT, const_tree);
Index: hooks.c
===
*** hooks.c	(revision 205220)
--- hooks.c	(working copy)
*** hook_rtx_tree_int_null (tree a ATTRIBUTE
*** 358,363 
--- 358,370 
return NULL;
  }
  
+ /* Generic hook that takes a machine mode and returns an unsigned int 0.  */
+ unsigned int
+ hook_uint_mode_0 (enum machine_mode m ATTRIBUTE_UNUSED)
+ {
+   return 0;
+ }
+ 
  /* Generic hook that takes three trees and returns the last one as is.  */
  tree
  hook_tree_tree_tree_tree_3rd_identity (tree a ATTRIBUTE_UNUSED,
Index: target.def
===
*** target.def	(revision 205220)
--- target.def	(working copy)
*** DEFHOOKPOD
*** 5297,5302 
--- 5297,5313 
   @code{bool} @code{true}.,
   unsigned char, 1)
  
+ /* Return an unsigned int representing the alignment (in bits) of the atomic
+type which maps to machine MODE.  This allows alignment to be overridden
+as needed.  */
+ DEFHOOK
+ (atomic_align_for_mode,
+ If defined, this function returns an appropriate alignment in bits for an\
+  atomic object of machine_mode @var{mode}.  If 0 is returned then the\
+  default alignment for the specified mode is used. ,
+  unsigned int, (enum machine_mode mode),
+  hook_uint_mode_0)
+ 
  DEFHOOK
  (atomic_assign_expand_fenv,
  ISO C11 requires atomic compound assignments that may raise floating-point\
Index: tree.c
===
*** tree.c	(revision 205220)
--- tree.c	(working copy)
*** make_or_reuse_accum_type (unsigned size,
*** 9547,9556 
 during initialization of data types to create the 5 basic atomic
 types. The generic build_variant_type function requires these to
 already be set up in order to function properly, so cannot be
!called from there.  */
  
  static tree
! build_atomic_base (tree type)
  {
tree t;
  
--- 9547,9557 
 during initialization of data types to create the 5 basic atomic
 types. The generic build_variant_type function requires these to
 already be set up in order to function properly, so cannot be
!called from there.  If ALIGN is non-zero, then ensure alignment is
!overridden to this value.  */
  
  static tree
! build_atomic_base (tree type, unsigned int align)
  {
tree t;
  
*** build_atomic_base (tree type)
*** 9561,9566 
--- 9562,9570 
t = build_variant_type_copy (type);
set_type_quals (t, TYPE_QUAL_ATOMIC);
  
+   if (align)
+ TYPE_ALIGN (t) = align;
+ 
return t;
  }
  
*** build_common_tree_nodes (bool signed_cha
*** 9648,9660 
  
/* Don't call build_qualified type for atomics.  That routine does
   special processing for atomics, and until they are initialized
!  it's better not to make that call.  */
! 

[PATCH, DOC] Document the POWER HTM built-in functions

2013-11-21 Thread Peter Bergner
The following patch documents the HTM built-in functions specific
to POWER.  I bootstrapped and reviewed the gcc.info file and
everything looks good.

Ok for mainline?

Peter


* doc/extend.texi: Document htm builtins.

Index: gcc/doc/extend.texi
===
--- gcc/doc/extend.texi (revision 205232)
+++ gcc/doc/extend.texi (working copy)
@@ -9218,6 +9218,7 @@ instructions, but allow the compiler to 
 * picoChip Built-in Functions::
 * PowerPC Built-in Functions::
 * PowerPC AltiVec/VSX Built-in Functions::
+* PowerPC Hardware Transactional Memory Built-in Functions::
 * RX Built-in Functions::
 * S/390 System z Built-in Functions::
 * SH Built-in Functions::
@@ -15170,6 +15171,196 @@ The second argument to the @var{__builti
 integer that is 0 or 1.  The third argument to these builtin functions
 must be a constant integer in the range of 0 to 15.
 
+@node PowerPC Hardware Transactional Memory Built-in Functions
+@subsection PowerPC Hardware Transactional Memory Built-in Functions
+GCC provides two interfaces for accessing the Hardware Transactional
+Memory (HTM) instructions available on some of the PowerPC family
+of processors (e.g., POWER8).  The two interfaces come in a low level
+interface, consisting of built-in functions specific to PowerPC and a
+higher level interface consisting of inline functions that are common
+between PowerPC and S/390.
+
+@subsubsection PowerPC HTM Low Level Built-in Functions
+
+The following low level built-in functions are available with
+@option{-mhtm} or @option{-mcpu=CPU} where CPU is `power8' or later.
+They all generate the machine instruction that is part of the name.
+
+The HTM built-ins return true or false depending on their success and
+their arguments match exactly the type and order of the associated
+hardware instruction's operands.  Refer to the ISA manual for a
+description of each instruction's operands.
+
+@smallexample
+unsigned int __builtin_tbegin (unsigned int)
+unsigned int __builtin_tend (unsigned int)
+
+unsigned int __builtin_tabort (unsigned int)
+unsigned int __builtin_tabortdc (unsigned int, unsigned int, unsigned int)
+unsigned int __builtin_tabortdci (unsigned int, unsigned int, int)
+unsigned int __builtin_tabortwc (unsigned int, unsigned int, unsigned int)
+unsigned int __builtin_tabortwci (unsigned int, unsigned int, int)
+
+unsigned int __builtin_tcheck (unsigned int)
+unsigned int __builtin_treclaim (unsigned int)
+unsigned int __builtin_trechkpt (void)
+unsigned int __builtin_tsr (unsigned int)
+@end smallexample
+
+In addition to the above HTM built-ins, we have added built-ins for
+some common extended mnemonics of the HTM instructions:
+
+@smallexample
+unsigned int __builtin_tendall (void)
+unsigned int __builtin_tresume (void)
+unsigned int __builtin_tsuspend (void)
+@end smallexample
+
+The following set of built-in functions are available to gain access
+to the HTM specific special purpose registers.
+
+@smallexample
+unsigned long __builtin_get_texasr (void)
+unsigned long __builtin_get_texasru (void)
+unsigned long __builtin_get_tfhar (void)
+unsigned long __builtin_get_tfiar (void)
+
+void __builtin_set_texasr (unsigned long);
+void __builtin_set_texasru (unsigned long);
+void __builtin_set_tfhar (unsigned long);
+void __builtin_set_tfiar (unsigned long);
+@end smallexample
+
+Example usage of these low level built-in functions may look like:
+
+@smallexample
+#include htmintrin.h
+
+int num_retries = 10;
+
+while (1)
+  @{
+if (__builtin_tbegin (0))
+  @{
+/* Transaction State Initiated.  */
+if (is_locked (lock))
+  __builtin_tabort (0);
+... transaction code...
+__builtin_tend (0);
+break;
+  @}
+else
+  @{
+/* Transaction State Failed.  Use locks if the transaction
+   failure is persistent or we've tried too many times.  */
+if (num_retries-- = 0
+|| _TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
+  @{
+acquire_lock (lock);
+... non transactional fallback path...
+release_lock (lock);
+break;
+  @}
+  @}
+  @}
+@end smallexample
+
+One final built-in function has been added that returns the value of
+the 2-bit Transaction State field of the Machine Status Register (MSR)
+as stored in @code{CR0}.
+
+@smallexample
+unsigned long __builtin_ttest (void)
+@end smallexample
+
+This built-in can be used to determine the current transaction state
+using the following code example:
+
+@smallexample
+#include htmintrin.h
+
+unsigned char tx_state = _HTM_STATE (__builtin_ttest ());
+
+if (tx_state == _HTM_TRANSACTIONAL)
+  @{
+/* Code to use in transactional state.  */
+  @}
+else if (tx_state == _HTM_NONTRANSACTIONAL)
+  @{
+/* Code to use in non-transactional state.  */
+  @}
+else if (tx_state == _HTM_SUSPENDED)
+  @{
+/* Code to use in transaction suspended state.  */
+  @}

Re: [PATCH] Updated automated patch (was Re: [PATCH 3/6] Automated part of conversion of gimple types to use C++ inheritance)

2013-11-21 Thread Andrew MacLeod

On 11/21/2013 05:42 PM, Jakub Jelinek wrote:

On Thu, Nov 21, 2013 at 03:24:55PM -0700, Jeff Law wrote:

On 11/21/13 15:19, Jakub Jelinek wrote:

On Mon, Nov 18, 2013 at 03:25:52PM -0500, David Malcolm wrote:

So is there some reason the GIMPLE_CHECK was left in here rather than
doing the downcasting?  This happens in other places.

Note that the changes removed tons of checks that IMHO were desirable.
The as_a that replaced those checks e.g. allows 3 different gimple codes,
while previously only one was allowed, this is both more expensive for
--enable-checking=yes, and allows one to use inline wrappers e.g.
gimple_omp_parallel_something on GIMPLE_OMP_TASK etc.

Can you give a couple examples, please?

I mean e.g.
gimple_omp_parallel_{,set_}{clauses,child_fn,data_arg}{,_ptr}
gimple_omp_taskreg_{,set_}{clauses,child_fn,data_arg}{,_ptr}
gimple_omp_target_{,set_}{clauses,child_fn,data_arg}{,_ptr}
gimple_omp_teams_{,set_}clauses{,_ptr}
gimple_omp_return_{,set_}lhs{,_ptr}
gimple_omp_atomic_store_{,set_}val{,_ptr}
gimple_resx_{,set_}region
gimple_eh_dispatch_{,set_}region

Jakub
Why does is_a_helper gimple_statement_omp_parallel::test allow 
anything other than a GIMPLE_OMP_PARALLEL?  That seems wrong to me. 
It should just be the one check.


gimple_omp_taskreg and other routines sharing that helper should have 
their own helper and only check the one code.  That's the whole point: to 
remain at least codegen neutral in these cases and provide correct 
checking.   The fact that they may happen to share the same underlying 
structure is irrelevant.


I also think this is wrong.

Andrew


Re: [patch 3/3] Flatten gimple.h target and tests

2013-11-21 Thread Jeff Law

On 11/21/13 11:17, Andrew MacLeod wrote:

On 11/21/2013 01:15 PM, Andrew MacLeod wrote:

The third patch has the config/*  target changes, as well as a few
testcases.  I did *not*  trim out  includes for the targets since I
got caught earlier with targets requiring some files on only some
configurations.  I did go through an remove any duplicates that were
introduced tho. (ie, if basic-block.h was already being included, I
removed the duplicate which flattening introduced.) I didn't try
reducing the includes in the testcases since that doesn't really matter.

I did run make all-gcc on all the targets in config-list.mk, and don't
appear to have caused any new failures... the failures that are there
appear to be unrelated to compilation of the changed files.


the 3rd patch...

Andrew

gflat3.patch


* config/darwin.c: Add all include files removed from gimple.h.
* config/aarch64/aarch64-builtins.c: Likewise.
* config/aarch64/aarch64.c: Likewise.
* config/alpha/alpha.c: Likewise.
* config/i386/i386.c: Likewise.
* config/i386/winnt.c: Likewise.
* config/ia64/ia64.c: Likewise.
* config/m32c/m32c.c: Likewise.
* config/mep/mep.c: Likewise.
* config/mips/mips.c: Likewise.
* config/rs6000/rs6000.c: Likewise.
* config/s390/s390.c: Likewise.
* config/sh/sh.c: Likewise.
* config/sparc/sparc.c: Likewise.
* config/spu/spu.c: Likewise.
* config/stormy16/stormy16.c: Likewise.
* config/tilegx/tilegx.c: Likewise.
* config/tilepro/tilepro.c: Likewise.
* config/xtensa/xtensa.c: Likewise.

* testsuite/gcc.dg/plugin/finish_unit_plugin.c: Add all include
files removed from gimple.h.
* testsuite/gcc.dg/plugin/ggcplug.c: Likewise.
* testsuite/gcc.dg/plugin/one_time_plugin.c: Likewise.
* testsuite/gcc.dg/plugin/selfassign.c: Likewise.
* testsuite/gcc.dg/plugin/start_unit_plugin.c: Likewise.
* testsuite/g++.dg/plugin/selfassign.c: Likewise.

OK once prerequisites have gone in.

jeff



Re: [PATCH] Builtins handling in IVOPT

2013-11-21 Thread Wei Mi
On Thu, Nov 21, 2013 at 1:01 PM, Richard Biener
richard.guent...@gmail.com wrote:
 Wei Mi w...@google.com wrote:
On Thu, Nov 21, 2013 at 11:36 AM, Richard Biener
richard.guent...@gmail.com wrote:
 Wei Mi w...@google.com wrote:
 So what you are doing is basically not only rewriting memory
references
 to possibly use TARGET_MEM_REF but also address uses to use
 TARGET_MEM_REF.  I think this is a good thing in general
 (given instructions like x86 lea) and I would not bother
distinguishing
 the different kind of uses.

 Richard.


You mean to change normal expr to TMR(expr) form in order to utilize
x86 lea type instructions as much as possible. It is interesting. I
can experiment that idea later. I am not sure if it could simply
work.
My concern is x86 lea still has some limitation (such as three
operands lea will have longer latency and can only be issued to
port1), if we change some expr to TMR(expr), will it inhitbit cse
opportunity if codegen find out it is not good to use lea?

 That needs to be determined.  Over all it might be because ivopts
runs so early.  At rtl level there should not be big differences apart
from better initial address computations.

 Did I misunderstand what your patch does?

 Richard.


My patch wants to address the issue that iv uses using as memory
reference actuals for load/store/prefetch builtins are treated as
non-linear iv uses instead of address iv uses, and the result of
determine_use_iv_cost is wrong. After we change those uses to address
uses, less ivs may be used, TMR will be generated for those iv uses
and efficent addressing mode could be utilized.

 But are not all pointer typed uses address uses?!

 Richard.


If a pointer typed use is plainly value passed to a func call, it is
not an address use, right? But as you said, x86 lea may help here.

Thanks,
Wei.


[PATCH] Remove location wrapping in tree-vectorizer.h

2013-11-21 Thread Richard Biener

tree-vectorizer.h wraps all of source_location, LOCATION_LINE,
LOCATION_FILE and UNKNOWN_LOCATION.

The following gets rid of that pointless exercise.

Bootstrapped / tested on x86_64-unknown-linux-gnu, applied.

Richard.

2013-11-21  Richard Biener  rguent...@suse.de

* tree-vectorizer.h (LOC, UNKNOWN_LOC, EXPR_LOC, LOC_FILE,
LOC_LINE): Remove wrappers and fix all users.
(struct _loop_vec_info): Remove loop_line_number member.
(LOOP_VINFO_LOC): Remove.
* tree-parloops.c, tree-vect-loop-manip.c, tree-vect-slp.c,
tree-vectorizer.c: Fix users of LOC, UNKNOWN_LOC, EXPR_LOC, LOC_FILE
and LOC_LINE.

Index: trunk/gcc/tree-parloops.c
===
*** trunk.orig/gcc/tree-parloops.c  2013-11-19 16:21:01.0 +0100
--- trunk/gcc/tree-parloops.c   2013-11-21 10:40:02.260898174 +0100
*** parallelize_loops (void)
*** 2145,2151 
reduction_info_table_type reduction_list;
struct obstack parloop_obstack;
HOST_WIDE_INT estimated;
!   LOC loop_loc;
  
/* Do not parallelize loops in the functions created by parallelization.  */
if (parallelized_function_p (cfun-decl))
--- 2145,2151 
reduction_info_table_type reduction_list;
struct obstack parloop_obstack;
HOST_WIDE_INT estimated;
!   source_location loop_loc;
  
/* Do not parallelize loops in the functions created by parallelization.  */
if (parallelized_function_p (cfun-decl))
*** parallelize_loops (void)
*** 2225,2233 
else
  fprintf (dump_file, parallelizing inner loop 
%d\n,loop-header-index);
loop_loc = find_loop_location (loop);
!   if (loop_loc != UNKNOWN_LOC)
  fprintf (dump_file, \nloop at %s:%d: ,
!  LOC_FILE (loop_loc), LOC_LINE (loop_loc));
}
gen_parallel_loop (loop, reduction_list,
 n_threads, niter_desc);
--- 2225,2233 
else
  fprintf (dump_file, parallelizing inner loop 
%d\n,loop-header-index);
loop_loc = find_loop_location (loop);
!   if (loop_loc != UNKNOWN_LOCATION)
  fprintf (dump_file, \nloop at %s:%d: ,
!  LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
}
gen_parallel_loop (loop, reduction_list,
 n_threads, niter_desc);
Index: trunk/gcc/tree-vect-loop-manip.c
===
*** trunk.orig/gcc/tree-vect-loop-manip.c   2013-11-21 10:25:01.0 
+0100
--- trunk/gcc/tree-vect-loop-manip.c2013-11-21 10:52:58.994743603 +0100
*** slpeel_make_loop_iterate_ntimes (struct
*** 661,667 
bool insert_after;
tree init = build_int_cst (TREE_TYPE (niters), 0);
tree step = build_int_cst (TREE_TYPE (niters), 1);
!   LOC loop_loc;
enum tree_code code;
  
orig_cond = get_loop_exit_condition (loop);
--- 661,667 
bool insert_after;
tree init = build_int_cst (TREE_TYPE (niters), 0);
tree step = build_int_cst (TREE_TYPE (niters), 1);
!   source_location loop_loc;
enum tree_code code;
  
orig_cond = get_loop_exit_condition (loop);
*** slpeel_make_loop_iterate_ntimes (struct
*** 691,699 
loop_loc = find_loop_location (loop);
if (dump_enabled_p ())
  {
!   if (LOCATION_LOCUS (loop_loc) != UNKNOWN_LOC)
!   dump_printf (MSG_NOTE, \nloop at %s:%d: , LOC_FILE (loop_loc),
!LOC_LINE (loop_loc));
dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
dump_printf (MSG_NOTE, \n);
  }
--- 691,699 
loop_loc = find_loop_location (loop);
if (dump_enabled_p ())
  {
!   if (LOCATION_LOCUS (loop_loc) != UNKNOWN_LOCATION)
!   dump_printf (MSG_NOTE, \nloop at %s:%d: , LOCATION_FILE (loop_loc),
!LOCATION_LINE (loop_loc));
dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
dump_printf (MSG_NOTE, \n);
  }
*** slpeel_tree_peel_loop_to_edge (struct lo
*** 1057,1063 
basic_block new_exit_bb;
gimple_stmt_iterator gsi;
edge exit_e = single_exit (loop);
!   LOC loop_loc;
tree cost_pre_condition = NULL_TREE;
/* There are many aspects to how likely the first loop is going to be 
executed.
   Without histogram we can't really do good job.  Simply set it to
--- 1057,1063 
basic_block new_exit_bb;
gimple_stmt_iterator gsi;
edge exit_e = single_exit (loop);
!   source_location loop_loc;
tree cost_pre_condition = NULL_TREE;
/* There are many aspects to how likely the first loop is going to be 
executed.
   Without histogram we can't really do good job.  Simply set it to
*** slpeel_tree_peel_loop_to_edge (struct lo
*** 1365,1371 
 location is calculated.
 Return the loop location if succeed and NULL if not.  */
  
! LOC
  find_loop_location (struct loop *loop)
  {
gimple stmt 

Re: RFA (cgraph): C++ 'structor decloning patch, Mark III

2013-11-21 Thread Joseph S. Myers
The new option should be able to use Var() in the .opt file rather than 
having a variable defined explicitly or any explicit handling code in 
c_common_handle_option, and shouldn't need to use UInteger (given the 
option has no arguments).

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: Implement C11 _Atomic

2013-11-21 Thread Hans-Peter Nilsson
On Thu, 21 Nov 2013, Andrew MacLeod wrote:
 I can bootstrap and check this on x86 to make sure it doesnt affect anything,
 and you can fool with it and see if you can get your desired results with your
 port.

Success!

For the record, tested together with the attached patch for the
CRIS ports, both regularly (not finished, but done with the C
testsuite part and no regressions there), as well as manually
for the attached test-programs, compiling and inspecting output
for different sub-targets and checking that data layout,
alignment and size is as intended.

Too bad about the libstdc++ atomics, but with this/these patches
at least I'll be able to tell people that _Atomic for C11 works.

Thanks to the both of you!

brgds, H-PIndex: config/cris/cris.c
===
--- config/cris/cris.c  (revision 205225)
+++ config/cris/cris.c  (working copy)
@@ -93,6 +93,8 @@ static int cris_reg_overlap_mentioned_p 
 static enum machine_mode cris_promote_function_mode (const_tree, enum 
machine_mode,
 int *, const_tree, int);
 
+static unsigned int cris_atomic_align_for_mode (enum machine_mode);
+
 static void cris_print_base (rtx, FILE *);
 
 static void cris_print_index (rtx, FILE *);
@@ -227,6 +229,9 @@ int cris_cpu_version = CRIS_DEFAULT_CPU_
 #undef TARGET_PROMOTE_FUNCTION_MODE
 #define TARGET_PROMOTE_FUNCTION_MODE cris_promote_function_mode
 
+#undef TARGET_ATOMIC_ALIGN_FOR_MODE
+#define TARGET_ATOMIC_ALIGN_FOR_MODE cris_atomic_align_for_mode
+
 #undef TARGET_STRUCT_VALUE_RTX
 #define TARGET_STRUCT_VALUE_RTX cris_struct_value_rtx
 #undef TARGET_SETUP_INCOMING_VARARGS
@@ -4018,6 +4023,14 @@ cris_promote_function_mode (const_tree t
 return mode;
   return CRIS_PROMOTED_MODE (mode, *punsignedp, type);
 } 
+
+/* Atomic types require alignment to be at least the natural size. */
+
+static unsigned int
+cris_atomic_align_for_mode (enum machine_mode mode)
+{
+  return GET_MODE_BITSIZE (mode);
+}
 
 /* Let's assume all functions return in r[CRIS_FIRST_ARG_REG] for the
time being.  */
/* Baseline struct: char followed by a plain int (natural alignment).  */
struct r0 {
  char a;
  int b;
};

/* Same layout but with the int explicitly forced to 4-byte alignment.  */
struct l0 {
  char a;
  int b __attribute__((__aligned__(4)));
};

/* Same layout with a C11 _Atomic int member; the test below checks that
   the _Atomic qualifier imposes at least the alignment of 'l0'.  */
struct a0 {
  char a;
  _Atomic int b;
};

/* Compile-time assertion: the array size is -!(cond), i.e. 0 (a GCC
   extension, accepted) when COND holds and -1 (a hard error) when it
   does not.  */
#define test(name, cond) typedef int test_ ## name [-!(cond)]

/* Tentative (zero-initialized) definitions of each flavor.  */
struct a0 ag;
struct l0 lg;
struct r0 rg;

/* Explicitly initialized definitions.  */
struct a0 ai = {0, 1};
struct l0 li = {1, 2};
struct r0 ri = {2, 3};

/* External declarations, so alignment/size must also hold for objects
   defined in another translation unit.  */
extern struct a0 ae;
extern struct l0 le;
extern struct r0 re;

void foo(void)
{
  struct a0 al;
  struct l0 ll;
  struct r0 rl;
  test(a_al_ge_4, __alignof__(al) = 4);
  test(a_al_ge_ll, __alignof__(al) = __alignof__(ll));
  test(a_al_ge_rl, __alignof__(al) = __alignof__(rl));
  test(a_ag_ge_4, __alignof__(ag) = 4);
  test(a_ag_ge_lg, __alignof__(ag) = __alignof__(lg));
  test(a_ag_ge_rg, __alignof__(ag) = __alignof__(rg));
  test(a_ai_ge_4, __alignof__(ai) = 4);
  test(a_ai_ge_li, __alignof__(ai) = __alignof__(li));
  test(a_ai_ge_ri, __alignof__(ai) = __alignof__(ri));
  test(a_ae_ge_4, __alignof__(ae) = 4);
  test(a_ae_ge_le, __alignof__(ae) = __alignof__(le));
  test(a_ae_ge_re, __alignof__(ae) = __alignof__(re));

  test(s_al_ge_4, sizeof(al) = 8);
  test(s_al_ge_ll, sizeof(al) = sizeof(ll));
  test(s_al_ge_rl, sizeof(al) = sizeof(rl));
  test(s_ag_ge_4, sizeof(ag) = 8);
  test(s_ag_ge_lg, sizeof(ag) = sizeof(lg));
  test(s_ag_ge_rg, sizeof(ag) = sizeof(rg));
  test(s_ai_ge_4, sizeof(ai) = 8);
  test(s_ai_ge_lg, sizeof(ai) = sizeof(li));
  test(s_ai_ge_rg, sizeof(ai) = sizeof(ri));
  test(s_ae_ge_4, sizeof(ae) = 8);
  test(s_ae_ge_le, sizeof(ae) = sizeof(le));
  test(s_ae_ge_re, sizeof(ae) = sizeof(re));

  test(ab_eq_4, __builtin_offsetof(struct a0, b) == 4);
  al = ai;
  al.b++;
  ae = al;
}
_Atomic int foo;
int bar;

/* Increment the _Atomic global; exercises C11 atomic read-modify-write
   codegen on a global object.  */
void baz(void)
{
  foo += 1;
}

/* Increment the plain global, for comparison against baz().  */
void foobar(void)
{
  bar += 1;
}

/* Bump the int pointed to by X by one (plain, non-atomic increment).  */
void xyzzy(int *x)
{
  *x += 1;
}

/* Bump the _Atomic int pointed to by X by one; a compound assignment on
   an atomic lvalue is an atomic read-modify-write in C11 (6.5.16.2).  */
void plugh(_Atomic int *x)
{
  *x += 1;
}

/* Increment via an explicit load/store pair through a local copy,
   rather than a single increment expression.  */
void xyzzy1(int *x)
{
  const int old = *x;
  *x = old + 1;
}

/* Increment through an _Atomic-qualified local temporary: this is an
   atomic load followed by a separate atomic store, not one atomic
   read-modify-write (noted in the surrounding discussion).  */
void plugh2(_Atomic int *x)
{
  _Atomic int snapshot = *x;
  *x = snapshot + 1;
}


Re: [PATCH] Enable multiple duplicated blocks on threading path

2013-11-21 Thread H.J. Lu
On Tue, Nov 19, 2013 at 9:11 PM, Jeff Law l...@redhat.com wrote:
 On 11/19/13 19:33, David Malcolm wrote:


 FWIW, it looks like you attached the whitespace cleanup patch again,
 rather than the one you discuss above.

 For the archives, it looks like your email is referring to r205074
 (though that itself seems to have some purely whitespace fixes, together
 with the real changes).

 Nuts.

 Here's the actual change I meant to attach.

 Sorry about that, thanks for catching it.

 Jeff


 commit f28876ff94428d3109e9e575d6a2d62d23af8092
 Author: law law@138bc75d-0d04-0410-961f-82ee72b054a4
 Date:   Wed Nov 20 01:55:17 2013 +

 * tree-ssa-threadedge.c (thread_across_edge): After threading
 through a joiner, allow threading a normal block requiring
 duplication.

 * tree-ssa-threadupdate.c (thread_block_1): Improve code to detect
 jump threading requests that would muck up the loop structures.

 git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@205074
 138bc75d-0d04-0410-961f-82ee72b054a4


This caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59221

-- 
H.J.


Re: [patch 1/3] Flatten gimple.h

2013-11-21 Thread Andrew MacLeod

On 11/21/2013 03:07 PM, Jeff Law wrote:

On 11/21/13 13:04, Andrew MacLeod wrote:

On 11/21/2013 02:26 PM, Jeff Law wrote:

On 11/21/13 11:15, Andrew MacLeod wrote:


Is there anything in particular one needs to do for plugins? I 
thought I
saw a patch somewhere that changed something in the Makefile, but 
don't
know if that is actually required since I never did that for any of 
the

others.   Any plugin which used gimple.h probably needs a few more
includes...

We need to make sure the header files that are needed by plugins
appear in Makefile.in::PLUGIN_HEADERS so that they get installed in a
place where plugins can find them.



stupid question perhaps, but aren't most  header files a potential
plugin header?Why don't we just install them all...
I think that's basically what's happened in the past, we just 
installed everything, or close to everything.


One way to find out would be to look at the set of .h files from 
gcc-4.8/gcc and look at what ultimately ends up in PLUGIN_HEADERS.  I 
bet they're pretty damn close :-)




  No one has complained yet, but in theory any .h I split up over the
past couple of months has the potential to be required... maintaining
that macro in Makefile.in seems kinda lame now that we don't maintain
the macros for building.  I'm sure its rotted already.
I wouldn't expect much fallout until after we started putting release 
candidates out there.  That doesn't mean we should wait until then to 
address the problem though ;-)
Of course it does :-)  If its a big enough issue, then maybe we fix it 
better :-)


Anyway, I'll add this to the patch...

Im going to recheck out trunk and rebuild and compare stuff, I'm getting 
weird spurious issues after I updated.  Probably be tomorrow sometime 
before I manage the checkin.


Andrew




* Makefile.in (PLUGIN_HEADERS): Add flattened gimple.h includes.

Index: Makefile.in
===
*** Makefile.in (revision 205213)
--- Makefile.in (working copy)
*** installdirs:
*** 3103,3109 
$(mkinstalldirs) $(DESTDIR)$(man7dir)
  
  PLUGIN_HEADERS = $(TREE_H) $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
!   toplev.h $(DIAGNOSTIC_CORE_H) $(BASIC_BLOCK_H) $(GIMPLE_H) $(TREE_PASS_H) 
$(GCC_PLUGIN_H) \
$(GGC_H) $(TREE_DUMP_H) $(PRETTY_PRINT_H) $(OPTS_H) $(PARAMS_H) \
$(tm_file_list) $(tm_include_list) $(tm_p_file_list) $(tm_p_include_list) \
$(host_xm_file_list) $(host_xm_include_list) $(xm_include_list) \
--- 3103,3111 
$(mkinstalldirs) $(DESTDIR)$(man7dir)
  
  PLUGIN_HEADERS = $(TREE_H) $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
!   toplev.h $(DIAGNOSTIC_CORE_H) $(BASIC_BLOCK_H) pointer-set.h 
$(HASH_TABLE_H) \
!   tree-ssa-alias.h $(INTERNAL_FN_H) gimple-fold.h tree-eh.h gimple-expr.h \
!   gimple.h is-a.h $(TREE_PASS_H) $(GCC_PLUGIN_H) \
$(GGC_H) $(TREE_DUMP_H) $(PRETTY_PRINT_H) $(OPTS_H) $(PARAMS_H) \
$(tm_file_list) $(tm_include_list) $(tm_p_file_list) $(tm_p_include_list) \
$(host_xm_file_list) $(host_xm_include_list) $(xm_include_list) \


Re: Implement C11 _Atomic

2013-11-21 Thread Andrew MacLeod

On 11/21/2013 06:23 PM, Hans-Peter Nilsson wrote:

On Thu, 21 Nov 2013, Andrew MacLeod wrote:

I can bootstrap and check this on x86 to make sure it doesnt affect anything,
and you can fool with it and see if you can get your desired results with your
port.

Success!

For the record, tested together with the attached patch for the
CRIS ports, both regularly (not finished, but done with the C
testsuite part and no regressions there), as well as manually
for the attached test-programs, compiling and inspecting output
for different sub-targets and checking that data layout,
alignment and size is as intended.

Too bad about the libstdc++ atomics, but with this/these patches
at least I'll be able to tell people that _Atomic for C11 works.

Thanks to the both of you!


All we need is a way to communicate the atomic property to the type 
within the libstdc++ template...  We cant use _Atomic there :-P


 I originally had created an __attribute__ ((atomic)) whch you could 
apply to the atomic template type and get the same behaviour  as 
_Atomic, Im not sure if there is another way or not.  The atomic 
template is generic, so I couldn't think of any special macro wizardry 
we could define...


Andrew


Re: [PATCH] Fix PR target/59233 (ICE on *-apple-darwin* with -m32 exposed by -freorder-blocks-and-partition)

2013-11-21 Thread Steven Bosscher
On Thu, Nov 21, 2013 at 8:57 PM, Teresa Johnson wrote:
 There are two problems I am fixing here (see
 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59233 for full analysis).

 The first is the original ICE in crossjump optimization, which was
 exposed by enabling -freorder-blocks-and-partition which was
 conservatively preventing some upstream block combining optimizations.
 The issue is that the block ended in a NOTE_INSN_DELETED, which
 old_insns_match_p could not handle. I am fixing this by passing
 old_insns_match_p the last instruction in the block that it knows how
 to handle.

 The second issue this exposed was that we were unnecessarily marking
 landing pad branches EDGE_PRESERVE since
 flag_reorder_blocks_and_partition was on, even though this was -Os and
 we will gate partitioning off.

So we keep an edge to a landing pad... Why is this a problem?


 * bb-reorder.c (do_partition_blocks): New function.
 (gate_handle_partition_blocks): Call do_partition_blocks.
 * bb-reorder.h (do_partition_blocks): Declare.
 * except.c (dw2_build_landing_pads): Call do_partition_blocks.

Exporting this gate function from bb-reorder.c shouldn't be necessary.
Better fix this at the root, in except.c.


 * cfgcleanup.c (outgoing_edges_match): Walk up past note instructions
 not understood by old_insns_match_p.

This part is OK.

Ciao!
Steven


[PATCH, i386]: Use ix86_zero_extend_to_Pmode where appropriate

2013-11-21 Thread Uros Bizjak
Hello!

2013-11-21  Uros Bizjak  ubiz...@gmail.com

* config/i386/i386.c (ix86_expand_special_args_builtin): Use
ix86_zero_extend_to_Pmode where appropriate.
(ix86_expand_builtin): Ditto.

Tested on x86_64-pc-linux-gnu {,-m32} and committed to mainline SVN.

Uros.
Index: config/i386/i386.c
===
--- config/i386/i386.c  (revision 205232)
+++ config/i386/i386.c  (working copy)
@@ -32577,7 +32577,7 @@ ix86_expand_special_args_builtin (const struct bui
   gcc_assert (target == 0);
   if (memory)
{
- op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
+ op = ix86_zero_extend_to_Pmode (op);
  target = gen_rtx_MEM (tmode, op);
}
   else
@@ -32622,7 +32622,7 @@ ix86_expand_special_args_builtin (const struct bui
  if (i == memory)
{
  /* This must be the memory operand.  */
- op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
+ op = ix86_zero_extend_to_Pmode (op);
  op = gen_rtx_MEM (mode, op);
  gcc_assert (GET_MODE (op) == mode
  || GET_MODE (op) == VOIDmode);
@@ -32870,7 +32870,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx sub
   mode1 = insn_data[icode].operand[1].mode;
   mode2 = insn_data[icode].operand[2].mode;
 
-  op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
+  op0 = ix86_zero_extend_to_Pmode (op0);
   op0 = gen_rtx_MEM (mode1, op0);
 
   if (!insn_data[icode].operand[0].predicate (op0, mode0))
@@ -32902,7 +32902,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx sub
op0 = expand_normal (arg0);
icode = CODE_FOR_sse2_clflush;
if (!insn_data[icode].operand[0].predicate (op0, Pmode))
- op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
+ op0 = ix86_zero_extend_to_Pmode (op0);
 
emit_insn (gen_sse2_clflush (op0));
return 0;
@@ -32915,7 +32915,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx sub
   op1 = expand_normal (arg1);
   op2 = expand_normal (arg2);
   if (!REG_P (op0))
-   op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
+   op0 = ix86_zero_extend_to_Pmode (op0);
   if (!REG_P (op1))
op1 = copy_to_mode_reg (SImode, op1);
   if (!REG_P (op2))
@@ -33172,7 +33172,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx sub
   op0 = expand_normal (arg0);
   icode = CODE_FOR_lwp_llwpcb;
   if (!insn_data[icode].operand[0].predicate (op0, Pmode))
-   op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
+   op0 = ix86_zero_extend_to_Pmode (op0);
   emit_insn (gen_lwp_llwpcb (op0));
   return 0;
 
@@ -33468,7 +33468,7 @@ addcarryx:
   /* Force memory operand only with base register here.  But we
 don't want to do it on memory operand for other builtin
 functions.  */
-  op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
+  op1 = ix86_zero_extend_to_Pmode (op1);
 
   if (!insn_data[icode].operand[1].predicate (op0, mode0))
op0 = copy_to_mode_reg (mode0, op0);


Re: [patch] Atomic alignment override.

2013-11-21 Thread Andrew MacLeod

On 11/21/2013 05:46 PM, Andrew MacLeod wrote:
I'd like to check in this code from the C11 branch so that it is 
present in 4.9.


Its adds a target hook which can be used to override the default 
alignment of an atomic type when used with C11's _Atomic qualifier. 
There are a couple of ports which have stricter alignment requirements 
for an atomic operation than the natural alignment of the integral 
type.Today they are just broken with no real facility to repair it.


If this hook is not utilized by a port, the code is transparent and 
should therefore be harmless in the code base.  It will enable the 
ports that care to experiment with changing alignment and see if their 
current problems can be resolved for C11  and then we can look to 
push that into C++11 atomic templates somehow.  It will also allow 
them to flush out any remaining problems that show up with the 
fundamental base code which enables it that went in as part of C11.


Bootstraps on x86-64 and currently running regression tests. Assuming 
everything passes, OK for mainline?


I wont actually check it in until HP reports back that it actually is 
useful to him, I just want to submit it before stage 1 ends :-).


Andrew
H-P reports back that  it solves the issues for CRIS : 
http://gcc.gnu.org/ml/gcc-patches/2013-11/msg02774.html


So I'd like to put it in.  It would be nice to enable this for the type 
within the C++11 atomic template too, but Im not sure there is an easy 
way short of a new __attribute__((atomic)) or something like that...



Andrew


Re: Implement C11 _Atomic

2013-11-21 Thread Hans-Peter Nilsson
On Thu, 21 Nov 2013, Hans-Peter Nilsson wrote:
 with this/these patches
 at least I'll be able to tell people that _Atomic for C11 works.

Oh right, gcc still doesn't remove target-introduced manual
alignment checks (when expanding atomic intrinsics), but at
least gcc makes sure it's aligned on stack, when options doesn't
say it's aligned.  And a.c:plugh2 doesn't seem to perform an
atomic assignment, but just assignment through an
_Atomic-aligned stack temporary.  Might be my C11-ignorance
showing.

(Without the patches layout and alignment is all broken.)

brgds, H-P


Plugin headers (was Re: [patch 1/3] Flatten gimple.h)

2013-11-21 Thread David Malcolm
On Thu, 2013-11-21 at 13:07 -0700, Jeff Law wrote:
 On 11/21/13 13:04, Andrew MacLeod wrote:
  On 11/21/2013 02:26 PM, Jeff Law wrote:
  On 11/21/13 11:15, Andrew MacLeod wrote:
 
  Is there anything in particular one needs to do for plugins? I thought I
  saw a patch somewhere that changed something in the Makefile, but don't
  know if that is actually required since I never did that for any of the
  others.   Any plugin which used gimple.h probably needs a few more
  includes...
  We need to make sure the header files that are needed by plugins
  appear in Makefile.in::PLUGIN_HEADERS so that they get installed in a
  place where plugins can find them.
 
 
  stupid question perhaps, but aren't most  header files a potential
  plugin header?Why don't we just install them all...
 I think that's basically what's happened in the past, we just installed 
 everything, or close to everything.
 
 One way to find out would be to look at the set of .h files from 
 gcc-4.8/gcc and look at what ultimately ends up in PLUGIN_HEADERS.  I 
 bet they're pretty damn close :-)
 
 
No one has complained yet, but in theory any .h I split up over the
  past couple of months has the potential to be required... maintaining
  that macro in Makefile.in seems kinda lame now that we don't maintain
  the macros for building.  I'm sure its rotted already.
 I wouldn't expect much fallout until after we started putting release 
 candidates out there.  That doesn't mean we should wait until then to 
 address the problem though ;-)

FWIW I can have a go at building/porting gcc-python-plugin against trunk
sometime early in stage3 [1]; that one pokes at a lot of different
things.

Dave

[1] like tomorrow :)



Re: [PATCH, M2] Compiler driver patches

2013-11-21 Thread Gaius Mulley
Joseph S. Myers jos...@codesourcery.com writes:

 On Wed, 20 Nov 2013, Gaius Mulley wrote:

 Thanks for all the comments regarding the set of patches.  Perhaps the
 statement use its own linker is misleading.  When gm2 is asked to link
 a module hello.mod it does the following:

 Thanks for the explanation.  I think this is sufficiently complicated that 
 driver patches to facilitate it can't effectively be reviewed separately 
 from the gm2 driver itself (i.e. the two patches should be posted for 
 review at the same time), unless the driver patches make sense as cleanups 
 in their own right in the context of the front ends already in tree.

 The sort of change that's more suitable for review indepedent of the front 
 end would be e.g. fixes for optimization bugs where you've only triggered 
 the bug with M2 input but will propose the patch on the basis that the 
 GIMPLE produced by the front end is valid and is being transformed in an 
 invalid way, even if no other front end produces that GIMPLE.

Again thank you for the feedback - yes this all sounds sensible.  I'll
work on the patch for the 'gm2' driver and associated conforming front
end config and makefile files and submit them together with the above,

regards,
Gaius




[Patch 0/4, AArch64] Conform vector implementation to ABI.

2013-11-21 Thread Tejas Belagod


Hi,

This patch series fixes aarch64's autovectorization and gcc FE vector extension 
programming models for ABI conformance. The ABI states


Elements in a short vector are numbered such that the lowest numbered element
(element 0) occupies the lowest numbered bit (bit zero) in the vector and 
successive elements take on progressively increasing bit positions in the
vector. When a short vector is transferred between registers and memory it is 
treated as an opaque object. That is a short vector is stored in memory as if it 
were stored with a single STR of the entire register; a short vector is loaded 
from memory using the corresponding LDR instruction. On a little-endian system 
this means that element 0 will always contain the lowest addressed element of a 
short vector; on a bigendian system element 0 will contain the highest-addressed 
element of a short vector.


To conform to ABI, this patch fixes vector mode loads to be LDR D/Q and stores 
to STR D/Q. This means that the order of the elements in a register are reversed 
for big-endian when loaded from memory. This incidentally mirrors the way gcc 
implements its vectors in RTL therefore becomes easy to interpret standard 
pattern names and RTL lane numbers while expansion as the order is the same as 
the NEON register. The data-flow seems to fall out quite easily. For example, 
the widening standard patterns in RTL expect high and low parts to be reversed 
for Big-Endian, and because we use LDR Q, the low and high parts of vectors are 
already reversed - we don't have to jump though hoops to fix this up. In 
contrast, the narrowing operations need the reversing as will be evident in the 
patches that follow. Simliarly the reduc_* patterns expect the scalar result in 
the LSB of the RTL register which is the same as (n-1)th lane in Bigendian which 
is the same lane when we conform to ABI.


In a series of 4 patches we fixes ABI conformance and fix some fall-out of bugs 
from that.


This set however does not fix up the model for NEON intrinsics that maps vld1_* 
to movmode and its associated lane accesses - this is coming soon.


Thanks,
Tejas Belagod.
ARM.



Re: [PATCH] Make forwprop fold series of VIEW_CONVERT_EXPRs

2013-11-21 Thread Jeff Law

On 11/21/13 04:20, Richard Biener wrote:


This moves another fold-const.c folding to the GIMPLE level.
In PR59058 it was noticed we fail to optimize

Well, you duplicated the optimization, you didn't move it :(

Of course the latter takes a lot more time to verify that nothing regresses.

Anyway, it still a step forward

jeff



Re: [RFC] Modify -g1 to produce line tables

2013-11-21 Thread Cary Coutant
 Having just looked at the opts.c and tree-ssa-live.c changes, they're fine.

Thanks, I've committed the patch.

-cary


Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread N.M. Maclaren

On Nov 21 2013, Uros Bizjak wrote:


Indeed, 387/SSE has flush-to-zero modes. But other APIs do not (glibc, 
SysV, AIX). I'm perfectly willing to add it, especially to 387/SSE, if 
given a bit of help (someone to write the assembly code).


Just set FTZ bit in mxcsr. Please see
libgcc/config/i386/crtfastmath.c, this mode is used when -ffast-math
is used.


Does that work for all of Intel's zoo of floating-point mechanisms?
Even if gfortran doesn't use all of them, it is important to at least
know which it works for and which it does not.  My understanding of
the architecture is that all of x87, MMX, SSE and AVX are potentially
different in this respect :-(


Regards,
Nick Maclaren.



[Patch 4/4] Conform vector implementation to ABI -- narrowing operations.

2013-11-21 Thread Tejas Belagod


Hi,

The attached patch swaps around high and low bits of the source operands of 
narrow patterns for big-endian so that they end up in the correct order in the 
destination.


Tested for aarch64-none-elf and aarch64_be-none-elf. OK for trunk?

Thanks,
Tejas Belagod
ARM.

2013-11-21  Tejas Belagod  tejas.bela...@arm.com

gcc/
* config/aarch64/aarch64-simd.md (vec_pack_trunc_mode,
vec_pack_trunc_v2df, vec_pack_trunc_df): Swap source ops for big-endian.diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index b9ebdf5..c802a27 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -891,9 +891,11 @@
  TARGET_SIMD
 {
   rtx tempreg = gen_reg_rtx (VDBLmode);
+  int lo = BYTES_BIG_ENDIAN ? 2 : 1;
+  int hi = BYTES_BIG_ENDIAN ? 1 : 2;
 
-  emit_insn (gen_move_lo_quad_Vdbl (tempreg, operands[1]));
-  emit_insn (gen_move_hi_quad_Vdbl (tempreg, operands[2]));
+  emit_insn (gen_move_lo_quad_Vdbl (tempreg, operands[lo]));
+  emit_insn (gen_move_hi_quad_Vdbl (tempreg, operands[hi]));
   emit_insn (gen_aarch64_simd_vec_pack_trunc_Vdbl (operands[0], tempreg));
   DONE;
 })
@@ -906,7 +908,12 @@
 (truncate:VNARROWQ (match_operand:VQN 1 register_operand w))
 (truncate:VNARROWQ (match_operand:VQN 2 register_operand w]
  TARGET_SIMD
- xtn\\t%0.Vntype, %1.Vtype\;xtn2\\t%0.V2ntype, %2.Vtype
+ {
+   if (BYTES_BIG_ENDIAN)
+ return xtn\\t%0.Vntype, %2.Vtype\;xtn2\\t%0.V2ntype, %1.Vtype;
+   else
+ return xtn\\t%0.Vntype, %1.Vtype\;xtn2\\t%0.V2ntype, %2.Vtype;
+ }
   [(set_attr type multiple)
(set_attr length 8)]
 )
@@ -1444,9 +1451,12 @@
   TARGET_SIMD
   {
 rtx tmp = gen_reg_rtx (V2SFmode);
-emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[1]));
+int lo = BYTES_BIG_ENDIAN ? 2 : 1;
+int hi = BYTES_BIG_ENDIAN ? 1 : 2;
+
+emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[lo]));
 emit_insn (gen_aarch64_float_truncate_hi_v4sf (operands[0],
-  tmp, operands[2]));
+  tmp, operands[hi]));
 DONE;
   }
 )
@@ -1462,8 +1472,11 @@
   TARGET_SIMD
   {
 rtx tmp = gen_reg_rtx (V2SFmode);
-emit_insn (gen_move_lo_quad_v2df (tmp, operands[1]));
-emit_insn (gen_move_hi_quad_v2df (tmp, operands[2]));
+int lo = BYTES_BIG_ENDIAN ? 2 : 1;
+int hi = BYTES_BIG_ENDIAN ? 1 : 2;
+
+emit_insn (gen_move_lo_quad_v2df (tmp, operands[lo]));
+emit_insn (gen_move_hi_quad_v2df (tmp, operands[hi]));
 emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp));
 DONE;
   }

Re: [patch 1/3] Flatten gimple.h

2013-11-21 Thread Jeff Law

On 11/21/13 13:04, Andrew MacLeod wrote:

On 11/21/2013 02:26 PM, Jeff Law wrote:

On 11/21/13 11:15, Andrew MacLeod wrote:


Is there anything in particular one needs to do for plugins? I thought I
saw a patch somewhere that changed something in the Makefile, but don't
know if that is actually required since I never did that for any of the
others.   Any plugin which used gimple.h probably needs a few more
includes...

We need to make sure the header files that are needed by plugins
appear in Makefile.in::PLUGIN_HEADERS so that they get installed in a
place where plugins can find them.



stupid question perhaps, but aren't most  header files a potential
plugin header?Why don't we just install them all...
I think that's basically what's happened in the past, we just installed 
everything, or close to everything.


One way to find out would be to look at the set of .h files from 
gcc-4.8/gcc and look at what ultimately ends up in PLUGIN_HEADERS.  I 
bet they're pretty damn close :-)




  No one has complained yet, but in theory any .h I split up over the
past couple of months has the potential to be required... maintaining
that macro in Makefile.in seems kinda lame now that we don't maintain
the macros for building.  I'm sure its rotted already.
I wouldn't expect much fallout until after we started putting release 
candidates out there.  That doesn't mean we should wait until then to 
address the problem though ;-)


Jeff


Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread Uros Bizjak
On Thu, Nov 21, 2013 at 1:19 PM, Uros Bizjak ubiz...@gmail.com wrote:

 Here’s my patch submission for adding IEEE intrinsic modules (following 
 Fortran 2003 and 2008
 standards) to gfortran. It implements the item 1, and part of item 2, of my 
 initial plan [1]. All the
 IEEE modules, types, named constants, procedures are defined and fully 
 working. The patch
 comes of course with plenty of testcases, and I can add some more if you can 
 think of things I’ve
  forgotten. I’ve bootstrapped and regtested the patch on:

   __asm__ __volatile__ (fnclex\n\tfldcw\t%0 : : m (cw));

 @@ -136,16 +165,54 @@ set_fpu (void)
__asm__ __volatile__ (%vstmxcsr\t%0 : =m (cw_sse));

/* The SSE exception masks are shifted by 7 bits.  */
 -  cw_sse |= _FPU_MASK_ALL  7;
 -  cw_sse = ~(excepts  7);
 -
 -  /* Clear stalled exception flags.  */
 -  cw_sse = ~_FPU_EX_ALL;

 You have to clear stalled SSE exceptions here. Their flags are in LSB
 bits, so their position is different than the position of exception
 mask bits in the control word.

 +  /* Change the flags. This is tricky on 387 (unlike SSE), because we have
 + FNSTSW but no FLDSW instruction.  */
 +  __asm__ __volatile__ (fnstenv\t%0 : =m (*temp));
 +
 +  temp.__status_word = ~exc_clr;
 +  temp.__status_word |= exc_set;
 +
 +  __asm__ __volatile__ (fldenv\t%0 : : m (*temp));

 Why do you need * here?

 fldenv will also trigger exceptions with set flags on the next x87 FP insn ...

 +__asm__ __volatile__ (%vstmxcsr\t%0 : =m (cw_sse));
 +
 +cw_sse = ~exc_clr;
 +cw_sse |= exc_set;
 +
 +__asm__ __volatile__ (%vldmxcsr\t%0 : : m (cw_sse));

 ... and ldmxcsr won't trigger exceptions, neither with SSE insn.
 Please see Intel documentation on FP exceptions.

Also, it is not clear to me, if the intention of the function is to
throw an exception? in this case, you should look at the code in
config/x86/fenv.c from libatomic (or config/i386/sfp-exceptions.c from
libgcc) for how exceptions should be generated.

Uros.


Re: [PATCH] Builtins handling in IVOPT

2013-11-21 Thread Richard Biener
On Thu, Nov 21, 2013 at 8:26 AM, Wei Mi w...@google.com wrote:
 Hi,

 This patch works on the intrinsic calls handling issue in IVOPT mentioned 
 here:
 http://gcc.gnu.org/ml/gcc-patches/2010-10/msg01295.html

 In find_interesting_uses_stmt, it changes

 arg = expr
 __builtin_xxx (arg)

 to

 arg = expr;
 tmp = addr_expr (mem_ref(arg));
 __builtin_xxx (tmp, ...)

 So mem_ref(arg) could be handled by find_interesting_uses_address, and
 an iv use of USE_ADDRESS type will be generated for expr, then a TMR
 will be generated for expr in rewrite_use_address. Expand pass is
 changed accordingly to keep the complex addressing mode not to be
 splitted for cse.

 With the patch we can handle the motivational case below.

 #include <x86intrin.h>
 extern __m128i arr[], d[];
 void test (void)
 {
 unsigned int b;
 for (b = 0; b < 1000; b += 2) {
   __m128i *p = (__m128i *)(&d[b]);
   __m128i a = _mm_load_si128(&arr[4*b+3]);
   __m128i v = _mm_loadu_si128(p);
   v = _mm_xor_si128(v, a);
   _mm_storeu_si128(p, v);
 }
 }

 gcc-r204792 Without the patch:
 .L2:
 movdqu  (%rax), %xmm0
 subq$-128, %rdx
 addq$32, %rax
 pxor-128(%rdx), %xmm0
 movups  %xmm0, -32(%rax)
 cmpq$arr+64048, %rdx
 jne .L2

 gcc-r204792 With the patch:
 .L2:
 movdqu  d(%rax), %xmm0
 pxorarr+48(,%rax,4), %xmm0
 addq$32, %rax
 movups  %xmm0, d-32(%rax)
 cmpq$16000, %rax
 jne .L2

 Following things to be addressed later:
 1. TER needs to be extended to handle the case when TMR is csed.

 2. For more complex cases to work, besides this patch, we also need to
 tune the AVG_LOOP_NITER, which is now set to 5, and it limits
 induction variables merging a lot. Increasing the parameter to a
 larger one could remove some induction variable in critical loop in
 some our benchmarks. reg pressure estimation may also need to be
 tuned. I will address it in a separate patch.

 3. Now the information about which param of a builtin is of memory
 reference type is simply listed as a switch-case in
 builtin_has_mem_ref_p and ix86_builtin_has_mem_ref_p. This is not
 ideal but there is no infrastructure to describe it in existing
 implementation. More detailed information such as parameter and call
 side-effect will be important for more precise alias and may worth
 some work. Maybe the refinement about this patch could be done after
 that.

 regression and bootstrap pass on x86_64-linux-gnu. ok for trunk?

So what you are doing is basically not only rewriting memory references
to possibly use TARGET_MEM_REF but also address uses to use
TARGET_MEM_REF.  I think this is a good thing in general
(given instructions like x86 lea) and I would not bother distinguishing
the different kind of uses.

Richard.

 Thanks,
 Wei.

 2013-11-20  Wei Mi  w...@google.com

 * expr.c (expand_expr_addr_expr_1): Not to split TMR.
 (expand_expr_real_1): Ditto.
 * targhooks.c (default_builtin_has_mem_ref_p): Default
 builtin.
 * tree-ssa-loop-ivopts.c (struct iv): Add field builtin_mem_param.
 (alloc_iv): Ditto.
 (remove_unused_ivs): Ditto.
 (builtin_has_mem_ref_p): New function.
 (find_interesting_uses_stmt): Special handling of builtins.
 * gimple-expr.c (is_gimple_address): Add handling of TMR.
 * gimple-expr.h (is_gimple_addressable): Ditto.
 * config/i386/i386.c (ix86_builtin_has_mem_ref_p): New target hook.
 (ix86_atomic_assign_expand_fenv): Ditto.
 (ix86_expand_special_args_builtin): Special handling of TMR for
 builtin.
 * target.def (builtin_has_mem_ref_p): New hook.
 * doc/tm.texi.in: Ditto.
 * doc/tm.texi: Generated.

 2013-11-20  Wei Mi  w...@google.com

 * gcc.dg/tree-ssa/ivopt_5.c: New test.

 Index: expr.c
 ===
 --- expr.c  (revision 204792)
 +++ expr.c  (working copy)
 @@ -7467,7 +7467,19 @@ expand_expr_addr_expr_1 (tree exp, rtx t
   tem = fold_build_pointer_plus (tem, TREE_OPERAND (exp, 1));
 return expand_expr (tem, target, tmode, modifier);
}
 +case TARGET_MEM_REF:
 +  {
 +   int old_cse_not_expected;
 +   addr_space_t as
 + = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (exp, 0;

 +   result = addr_for_mem_ref (exp, as, true);
 +   old_cse_not_expected = cse_not_expected;
 +   cse_not_expected = true;
 +   result = memory_address_addr_space (tmode, result, as);
 +   cse_not_expected = old_cse_not_expected;
 +return result;
 +  }
  case CONST_DECL:
/* Expand the initializer like constants above.  */
result = XEXP (expand_expr_constant (DECL_INITIAL (exp),
 @@ -9526,9 +9538,13 @@ expand_expr_real_1 (tree exp, rtx target
   = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (TREE_OPERAND 

[patch 1/3] Flatten gimple.h

2013-11-21 Thread Andrew MacLeod

The final gimple re-org patch!

These patches move the #includes out of gimple.h and into the .c files 
which include gimple.h.  They are:


#include pointer-set.h
#include hash-table.h
#include vec.h
#include ggc.h
#include basic-block.h
#include tree-ssa-alias.h
#include internal-fn.h
#include gimple-fold.h
#include tree-eh.h
#include gimple-expr.h
#include is-a.h

The first patch is the core changes, plus the #include changes in the 
core compiler directory.


I moved recalculate_side_effects() from gimple.c to gimplify.c. 
gimplify.c was the primary consumer, and the only other caller was in 
the ada front end.  By moving it, the ada front end doesn't need 
gimple.h any more.


I also trimmed out any  .h file introduced which the .c file does not 
require.  None of these specific .h files appear to have any ordering 
issues, so this should be safe.


The second patch has the language front end changes. its only the 
${language}-gimplify.c files which now actually require gimple.h itself 
(yay).  I also trimmed out the #includes which were introduced but not 
required there as well.


The third patch has the config/*  target changes, as well as a few 
testcases.  I did *not*  trim out  includes for the targets since I got 
caught earlier with targets requiring some files on only some 
configurations.  I did go through an remove any duplicates that were 
introduced tho. (ie, if basic-block.h was already being included, I 
removed the duplicate which flattening introduced.) I didn't try 
reducing the includes in the testcases since that doesn't really matter.


I did run make all-gcc on all the targets in config-list.mk, and don't 
appear to have caused any new failures... the failures that are there 
appear to be unrelated to compilation of the changed files.


Is there anything in particular one needs to do for plugins? I thought I 
saw a patch somewhere that changed something in the Makefile, but don't 
know if that is actually required since I never did that for any of the 
others.   Any plugin which used gimple.h probably needs a few more 
includes...


This bootstraps on x86_64-unknown-linux-gnu, and regressions are 
currently running.  Assuming it passes fine, OK?


Andrew


gflat1.patch.gz
Description: application/gzip


Re: [wide-int] Undo some differences with trunk

2013-11-21 Thread Richard Sandiford
Mike Stump mikest...@comcast.net writes:
 On Nov 20, 2013, at 5:58 AM, Richard Sandiford
 rsand...@linux.vnet.ibm.com wrote:
 I've committed a patch upstream to convert TREE_INT_CST_LOW to 
 tree_to_[su]hwi
 if there is an obvious tree_fits_[su]hwi_p guard.

 So, in general, I like putting them on trunk, and then merging them into
 the branch.  When that is done, the difference disappears without the
 removal of the difference.  If one misses one, then it does't magically
 disappear.  If one changes it in a manner that is different than the
 branch, the change then shows up as a conflict to resolve.  Also, when
 that is done, we can then just see the actual changes you are making,
 instead of those intermixed into random changes that made it to trunk
 and random ones that did not.

I don't get what you're trying to say.  I _did_ put the changes I want
to keep on trunk, and merged them back to the branch so that the remaining
TREE_INT_CST_LOW-tree_to_* changes stood out.

The remaining branch changes weren't obvious without some justification
and are independent of what we're doing in wide-int.  Which is why I asked
permission to revert them on the branch and leave them to be handled 
separately.  As I said, the patch is still here, so it's not like the
work is lost.

 The patch also undoes a couple of other ordering and whitespace differences.

 So, for example, these changes are just dropped on the floor.  :-( I
 picked them up, and put them into trunk.  I can't tell which other
 patches are also so dropped that I might think are important.

That's why I'm posting the patches for review, so you can see and comment.

Thanks,
Richard



Re: [PATCH, i386, MPX, 2/X] Pointers Checker [22/25] Target builtins

2013-11-21 Thread Uros Bizjak
Hello!

 2013-11-15  Ilya Enkovich  ilya.enkov...@intel.com

 * config/i386/i386-builtin-types.def (BND): New.
 (ULONG): New.
 (BND_FTYPE_PCVOID_ULONG): New.
 (VOID_FTYPE_BND_PCVOID): New.
 (VOID_FTYPE_PCVOID_PCVOID_BND): New.
 (BND_FTYPE_PCVOID_PCVOID): New.
 (BND_FTYPE_PCVOID): New.
 (BND_FTYPE_BND_BND): New.
 (PVOID_FTYPE_PVOID_PVOID_ULONG): New.
 (PVOID_FTYPE_PCVOID_BND_ULONG): New.
 (ULONG_FTYPE_VOID): New.
 (PVOID_FTYPE_BND): New.
 * config/i386/i386.c: Include tree-chkp.h, rtl-chkp.h.
 (ix86_builtins): Add
 IX86_BUILTIN_BNDMK, IX86_BUILTIN_BNDSTX,
 IX86_BUILTIN_BNDLDX, IX86_BUILTIN_BNDCL,
 IX86_BUILTIN_BNDCU, IX86_BUILTIN_BNDRET,
 IX86_BUILTIN_BNDSET, IX86_BUILTIN_BNDNARROW,
 IX86_BUILTIN_BNDINT, IX86_BUILTIN_ARG_BND,
 IX86_BUILTIN_SIZEOF, IX86_BUILTIN_BNDLOWER,
 IX86_BUILTIN_BNDUPPER.
 (builtin_isa): Add leaf_p and nothrow_p fields.
 (def_builtin): Initialize leaf_p and nothrow_p.
 (ix86_add_new_builtins): Handle leaf_p and nothrow_p
 flags.
 (bdesc_mpx): New.
 (bdesc_mpx_const): New.
 (ix86_init_mpx_builtins): New.
 (ix86_init_builtins): Call ix86_init_mpx_builtins.
 (ix86_expand_builtin): expand IX86_BUILTIN_BNDMK,
 IX86_BUILTIN_BNDSTX, IX86_BUILTIN_BNDLDX,
 IX86_BUILTIN_BNDCL, IX86_BUILTIN_BNDCU,
 IX86_BUILTIN_BNDRET, IX86_BUILTIN_BNDSET,
 IX86_BUILTIN_BNDNARROW, IX86_BUILTIN_BNDINT,
 IX86_BUILTIN_ARG_BND, IX86_BUILTIN_SIZEOF,
 IX86_BUILTIN_BNDLOWER, IX86_BUILTIN_BNDUPPER.

+  if (decl)
+ {
+  DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier (leaf),
+NULL_TREE);
+  TREE_NOTHROW (decl) = 1;
+ }
+  else
+ {
+  ix86_builtins_isa[(int)d-code].leaf_p = true;
+  ix86_builtins_isa[(int)d-code].nothrow_p = true;
+ }

Is there a reason these builtins need to be leaf and nothrow? It is
not clear from the source what is the reason. I'd suggest a comment
explaining this fact.

+  /* Builtin arg1 is size of block but instruction op1 should be
(size - 1).  */
+  op0 = expand_normal (arg0);
+  op1 = expand_normal (fold_build2 (PLUS_EXPR, TREE_TYPE (arg1),
+ arg1, integer_minus_one_node));
+  op0 = force_reg (Pmode, op0);
+  op1 = force_reg (Pmode, op1);

A tree expert should say if the above approach of operand fixups in
the patch is OK.

+  /* Avoid registers which connot be used as index.  */
+  if (REGNO (op1) == VIRTUAL_INCOMING_ARGS_REGNUM
+  || REGNO (op1) == VIRTUAL_STACK_VARS_REGNUM
+  || REGNO (op1) == VIRTUAL_OUTGOING_ARGS_REGNUM)

As said elsewhere, you want to check with index_register_operand predicate here.

+  /* If op1 was a register originally then it may have
+ mode other than Pmode.  We need to extend in such
+ case because bndldx may work only with Pmode regs.  */
+  if (GET_MODE (op1) != Pmode)
+ {
+  rtx ext = gen_rtx_ZERO_EXTEND (Pmode, op1);
+  op1 = gen_reg_rtx (Pmode);
+  emit_move_insn (op1, ext);
+ }

Please use ix86_zero_extend_to_Pmode.

+case IX86_BUILTIN_BNDNARROW:
+  {
+ enum machine_mode mode = BNDmode;
+ enum machine_mode hmode = Pmode;

No need for static temporaries, use mode directly.

+ /* Generate mem expression to be used for access to LB and UB.  */
+ m1h1 = gen_rtx_MEM (hmode, XEXP (m1, 0));
+ m1h2 = gen_rtx_MEM (hmode,
+gen_rtx_PLUS (Pmode, XEXP (m1, 0),
+  GEN_INT (GET_MODE_SIZE (hmode;

Use subreg infrastructure here?

+ if (TARGET_CMOVE)
+  {
+t2 = ix86_expand_compare (LTU, t1, lb);
+emit_insn (gen_rtx_SET (VOIDmode, t1,
+gen_rtx_IF_THEN_ELSE (hmode, t2, lb, t1)));
+  }
+ else
+  {
+rtx nomove = gen_label_rtx ();
+emit_cmp_and_jump_insns (t1, lb, GEU, const0_rtx, hmode, 1, nomove);
+emit_insn (gen_rtx_SET (VOIDmode, t1, lb));
+emit_label (nomove);
+  }
+ emit_move_insn (m1h1, t1);

This is used in a couple of places. Perhaps make a helper function?

+m1 = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);

Please use assign_i386_stack_local (mode, SLOT_TEMP)

+ m1h1 = gen_rtx_MEM (hmode, XEXP (m1, 0));
+ m1h2 = gen_rtx_MEM (hmode,
+gen_rtx_PLUS (Pmode, XEXP (m1, 0),
+  GEN_INT (GET_MODE_SIZE (hmode;
+ m2h1 = gen_rtx_MEM (hmode, XEXP (m2, 0));
+ m2h2 = gen_rtx_MEM (hmode,
+gen_rtx_PLUS (Pmode, XEXP (m2, 0),
+  GEN_INT (GET_MODE_SIZE (hmode;
+ rh1 = gen_rtx_MEM (hmode, XEXP (res, 0));
+ rh2 = gen_rtx_MEM (hmode,
+   gen_rtx_PLUS (Pmode, XEXP (res, 0),
+ GEN_INT (GET_MODE_SIZE (hmode;

subergs, or plus_constant.

Uros.


Re: [PATCH] Builtins handling in IVOPT

2013-11-21 Thread Richard Biener
Wei Mi w...@google.com wrote:
On Thu, Nov 21, 2013 at 11:36 AM, Richard Biener
richard.guent...@gmail.com wrote:
 Wei Mi w...@google.com wrote:
 So what you are doing is basically not only rewriting memory
references
 to possibly use TARGET_MEM_REF but also address uses to use
 TARGET_MEM_REF.  I think this is a good thing in general
 (given instructions like x86 lea) and I would not bother
distinguishing
 the different kind of uses.

 Richard.


You mean to change normal expr to TMR(expr) form in order to utilize
x86 lea type instructions as much as possible. It is interesting. I
can experiment that idea later. I am not sure if it could simply
work.
My concern is x86 lea still has some limitation (such as three
operands lea will have longer latency and can only be issued to
port1), if we change some expr to TMR(expr), will it inhibit cse
opportunity if codegen finds out it is not good to use lea?

 That needs to be determined.  Over all it might be because ivopts
runs so early.  At rtl level there should not be big differences apart
from better initial address computations.

 Did I misunderstand what your patch does?

 Richard.


My patch wants to address the issue that iv uses serving as memory
reference actuals for load/store/prefetch builtins are treated as
non-linear iv uses instead of address iv uses, and the result of
determine_use_iv_cost is wrong. After we change those uses to address
uses, less ivs may be used, TMR will be generated for those iv uses
and efficent addressing mode could be utilized.

But are not all pointer typed uses address uses?!

Richard.

Thanks,
Wei.




Re: Implement C11 _Atomic

2013-11-21 Thread Joseph S. Myers
On Thu, 21 Nov 2013, Hans-Peter Nilsson wrote:

 Oh right, gcc still doesn't remove target-introduced manual
 alignment checks (when expanding atomic intrinsics), but at
 least gcc makes sure it's aligned on stack, when options doesn't
 say it's aligned.  And a.c:plugh2 doesn't seem to perform an
 atomic assignment, but just assignment through an
 _Atomic-aligned stack temporary.  Might be my C11-ignorance
 showing.

It appears to me on x86_64 to produce an __atomic_store_4 to *x (in the 
GIMPLE dumps, what happens after that is a matter for the back end).

Note that atomic variable initialization is *not* atomic (see 7.17.2.1 - 
in general ATOMIC_VAR_INIT needs using with the initializer, or the 
initialization needs to be carried out with atomic_init, though GCC 
doesn't require that).  (In C11, the effect of a plain initialization 
without ATOMIC_VAR_INIT is I think that the initializer is evaluated for 
its side effects, but if the variable gets used as either rvalue or lvalue 
without one of the special forms of initialization being used first then 
the behavior is undefined.  The idea is to support implementations with 
extra bits in atomic objects used for locking purposes.)  So no atomic 
store to y is expected - although there are atomic loads from y.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: _Cilk_spawn and _Cilk_sync for C++

2013-11-21 Thread Jason Merrill

On 11/17/2013 10:19 PM, Iyer, Balaji V wrote:

   cp/cp-cilkplus.o \
- cp/cp-gimplify.o cp/cp-array-notation.o cp/lambda.o \
+ cp/cp-gimplify.o cp/cp-array-notation.o cp/lambda.o cp/cp-cilk.o \


It seems unnecessary to have both cp-cilk.c and cp-cilkplus.c.  Please 
combine them.



+  extern tree do_begin_catch (void);
+  extern tree do_end_catch (tree);


If you want to use these, they need to be declared in cp-tree.h, not 
within another function.  Or better yet, factor out this code:



+  append_to_statement_list (do_begin_catch (), catch_list);
+  append_to_statement_list (build_throw (NULL_TREE), catch_list);
+  tree catch_tf_expr = build_stmt (EXPR_LOCATION (body), TRY_FINALLY_EXPR,
+  catch_list, do_end_catch (NULL_TREE));
+  catch_list = build2 (CATCH_EXPR, void_type_node, NULL_TREE,
+  catch_tf_expr);
+  tree try_catch_expr = build_stmt (EXPR_LOCATION (body), TRY_CATCH_EXPR,
+   body, catch_list);


...into a function in cp/except.c.


+  tree try_finally_expr = build_stmt (EXPR_LOCATION (body),
+ TRY_FINALLY_EXPR,
+try_catch_expr, dtor);
+  append_to_statement_list (try_finally_expr, list);
+}
+  else
+append_to_statement_list (build_stmt (EXPR_LOCATION (body),
+ TRY_FINALLY_EXPR, body, dtor), list);


This bit could be shared between the two branches.


+  /* When Cilk Plus is enabled, the lambda function need to be stored to
+ a variable because if the function is spawned, then we need some kind
+ of a handle.  */
+  if (flag_enable_cilkplus  cxx_dialect = cxx0x
+   TREE_CODE (fn) != VAR_DECL  TREE_CODE (fn) != OVERLOAD
+   TREE_CODE (fn) != FUNCTION_DECL)
+fn = cilk_create_lambda_fn_tmp_var (fn);


I don't like making this change here.  What do you need a handle for? 
Why can't build_cilk_spawn deal with it?



+case CILK_SPAWN_STMT:
+  if (!potential_constant_expression_1 (CILK_SPAWN_FN (t), true, flags))
+   return false;
+  return true;


Isn't Cilk spawn itself is non-constant, so you can just return false?

Jason



Re: three problems with stor-layout.c.

2013-11-21 Thread Richard Biener
On Wed, 20 Nov 2013, Kenneth Zadeck wrote:

 Richi,
 
 We noticed three problems with the place_field on the wide-int branch.They
 come from problems on the trunk.   So i assume you want me to fix the trunk
 and push back into the branch.   The question is how do you want them fixed?
 
 1) at line 1198, rli-offset is tested as an unsigned int and used as a signed
 int on line 1203.
 2) The same mistake is made on lines 1241 and 1247.
 3) at line 1303, TYPE_SIZE (type) is tested as a signed int and used as an
 unsigned int on line 1313.
 
 They can be fixed by you saying if they are really signed or unsigned or we
 can just convert them to double-int and push the change as addr wide-ints to
 the branch.

I think most of these are because in the past (yes I have fixed that!!)
all 'sizetype' constants were sign-extended (and the signedness,
that is, TYPE_UNSIGNED (sizetype), was frontend dependend (ugh) and
then later true, thus unsigned).

So I think all _SIZE stuff should check fits_uhwi_p and be used as
uhwi.  But that may have ripple-down effects, so consistently
using fits_shwi_p and using as shwi is also fine (it just restricts
the maximum values we accept(?)).

Richard.


[PATCH, rs6000] More vector LE cleanups

2013-11-21 Thread Bill Schmidt
Hi,

This patch fixes two issues to allow correct compilation of
gcc.dg/torture/vec-cvt-1.c in little endian mode.  The first reverts a
change in three patterns in vector.md.  This is from an early patch that
preceded the general fix for vector permutes.  As a consequence we ended
up swapping the input arguments twice.  So we can simplify the code here
and have it operate the same for big and little endian.

The other issue corrects a scenario where I managed to check for
endianness twice, with the effect that the code acts the same for both
big and little endian when it shouldn't.

Bootstrapped and tested on powerpc64{,le}-unknown-linux-gnu with no
regressions.  Is this ok for trunk?

Thanks,
Bill


2013-11-21  Bill Schmidt  wschm...@vnet.ibm.com

* config/rs6000/vector.md (vec_pack_trunc_v2df): Revert previous
little endian change.
(vec_pack_sfix_trunc_v2df): Likewise.
(vec_pack_ufix_trunc_v2df): Likewise.
* config/rs6000/rs6000.c (rs6000_expand_interleave): Correct
double checking of endianness.


Index: gcc/config/rs6000/vector.md
===
--- gcc/config/rs6000/vector.md (revision 205145)
+++ gcc/config/rs6000/vector.md (working copy)
@@ -831,12 +831,7 @@
 
   emit_insn (gen_vsx_xvcvdpsp (r1, operands[1]));
   emit_insn (gen_vsx_xvcvdpsp (r2, operands[2]));
-
-  if (BYTES_BIG_ENDIAN)
-rs6000_expand_extract_even (operands[0], r1, r2);
-  else
-rs6000_expand_extract_even (operands[0], r2, r1);
-
+  rs6000_expand_extract_even (operands[0], r1, r2);
   DONE;
 })
 
@@ -851,12 +846,7 @@
 
   emit_insn (gen_vsx_xvcvdpsxws (r1, operands[1]));
   emit_insn (gen_vsx_xvcvdpsxws (r2, operands[2]));
-
-  if (BYTES_BIG_ENDIAN)
-rs6000_expand_extract_even (operands[0], r1, r2);
-  else
-rs6000_expand_extract_even (operands[0], r2, r1);
-
+  rs6000_expand_extract_even (operands[0], r1, r2);
   DONE;
 })
 
@@ -871,12 +861,7 @@
 
   emit_insn (gen_vsx_xvcvdpuxws (r1, operands[1]));
   emit_insn (gen_vsx_xvcvdpuxws (r2, operands[2]));
-
-  if (BYTES_BIG_ENDIAN)
-rs6000_expand_extract_even (operands[0], r1, r2);
-  else
-rs6000_expand_extract_even (operands[0], r2, r1);
-
+  rs6000_expand_extract_even (operands[0], r1, r2);
   DONE;
 })
 
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 205145)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -30116,7 +30116,7 @@ rs6000_expand_interleave (rtx target, rtx op0, rtx
   unsigned i, high, nelt = GET_MODE_NUNITS (vmode);
   rtx perm[16];
 
-  high = (highp == BYTES_BIG_ENDIAN ? 0 : nelt / 2);
+  high = (highp ? 0 : nelt / 2);
   for (i = 0; i  nelt / 2; i++)
 {
   perm[i * 2] = GEN_INT (i + high);




Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread Uros Bizjak
Hello!

 However, it is used in the form of selecting hard underflow using a
 compilation option, and not within the program.  You certainly DO have
 targets where it would work, even dynamically within the program, and I
 think that it could be done even on x86.  That isn't the same as it
 should be done, of course!

 Indeed, 387/SSE has flush-to-zero modes. But other APIs do not (glibc, SysV, 
 AIX).
 I’m perfectly willing to add it, especially to 387/SSE, if given a bit of 
 help (someone to write the
 assembly code).

Just set FTZ bit in mxcsr. Please see
libgcc/config/i386/crtfastmath.c, this mode is used when -ffast-math
is used.

Uros.


Re: [PATCH] Updated automated patch (was Re: [PATCH 3/6] Automated part of conversion of gimple types to use C++ inheritance)

2013-11-21 Thread David Malcolm
On Thu, 2013-11-21 at 18:03 -0500, Andrew MacLeod wrote:
 On 11/21/2013 05:42 PM, Jakub Jelinek wrote:
  On Thu, Nov 21, 2013 at 03:24:55PM -0700, Jeff Law wrote:
  On 11/21/13 15:19, Jakub Jelinek wrote:
  On Mon, Nov 18, 2013 at 03:25:52PM -0500, David Malcolm wrote:
  So is there some reason the GIMPLE_CHECK was left in here rather than
  doing the downcasting?  This happens in other places.
  Note that the changes removed tons of checks that IMHO were desirable.
  The as_a that replaced those checks e.g. allows 3 different gimple codes,
  while previously only one was allowed, this is both more expensive for
  --enable-checking=yes, and allows one to use inline wrappers e.g.
  gimple_omp_parallel_something on GIMPLE_OMP_TASK etc.
  Can you give a couple examples, please?
  I mean e.g.
  gimple_omp_parallel_{,set_}{clauses,child_fn,data_arg}{,_ptr}
  gimple_omp_taskreg_{,set_}{clauses,child_fn,data_arg}{,_ptr}
  gimple_omp_target_{,set_}{clauses,child_fn,data_arg}{,_ptr}
  gimple_omp_teams_{,set_}clauses{,_ptr}
  gimple_omp_return_{,set_}lhs{,_ptr}
  gimple_omp_atomic_store_{,set_}val{,_ptr}
  gimple_resx_{,set_}region
  gimple_eh_dispatch_{,set_}region
 
  Jakub
 Why does  is_a_helper gimple_statement_omp_parallel::test allow 
 anything other than a GIMPLE_OMP_PARALLEL..?  That seems wrong to me. 
 should just be the one check.
 
 gimple_omp_taskreg and other routines sharing that helper should have 
 their own helper and only check the one code.. that is the whole point: to 
 remain at least codegen neutral in these cases and provide correct 
 checking.   The fact that they may happen to share the same underlying 
 structure is irrelevant.
 
 I also think this is wrong.

This was a bug in my script.  Sorry.  Working on a fix.



[Patch 2/4] Conform vector implementation to ABI -- lane set and get.

2013-11-21 Thread Tejas Belagod

Hi,

This patch fixes up the lane access patterns to be symmetric to the order in 
which vectors are stored in registers.


Tested for aarch64-none-elf and aarch64_be-none-elf. OK for trunk?

Thanks,
Tejas Belagod
ARM.

2013-11-21  Tejas Belagod  tejas.bela...@arm.com

gcc/
* config/aarch64/aarch64-simd.md (aarch64_simd_vec_setmode): Adjust
for big-endian element order.
(aarch64_simd_vec_setv2di): Likewise.
(*aarch64_get_lane_extendGPI:modeVDQQH:mode,
*aarch64_get_lane_zero_extendsimode, aarch64_get_lane): Likewise.
(vec_extract): Expand using aarch64_get_lane.
* config/aarch64/aarch64.h (ENDIAN_LANE_N): New.diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index b9ebdf5..b4a0a5b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -428,9 +428,19 @@
(match_operand:VQ_S 3 register_operand 0,0)
(match_operand:SI 2 immediate_operand i,i)))]
   TARGET_SIMD
-  @
-   ins\t%0.Vetype[%p2], %w1
-   ins\\t%0.Vetype[%p2], %1.Vetype[0]
+  {
+   int elt = ENDIAN_LANE_N (MODEmode, exact_log2 (INTVAL (operands[2])));
+   operands[2] = GEN_INT ((HOST_WIDE_INT) 1  elt);
+   switch (which_alternative)
+ {
+ case 0:
+   return ins\\t%0.Vetype[%p2], %w1;
+ case 1:
+   return ins\\t%0.Vetype[%p2], %1.Vetype[0];
+ default:
+   gcc_unreachable ();
+ }
+  }
   [(set_attr type neon_from_gpq, neon_insq)]
 )
 
@@ -692,9 +702,19 @@
(match_operand:V2DI 3 register_operand 0,0)
(match_operand:SI 2 immediate_operand i,i)))]
   TARGET_SIMD
-  @
-   ins\t%0.d[%p2], %1
-   ins\\t%0.d[%p2], %1.d[0]
+  {
+int elt = ENDIAN_LANE_N (V2DImode, exact_log2 (INTVAL (operands[2])));
+operands[2] = GEN_INT ((HOST_WIDE_INT) 1  elt);
+switch (which_alternative)
+  {
+  case 0:
+   return ins\\t%0.d[%p2], %1;
+  case 1:
+return ins\\t%0.d[%p2], %1.d[0];
+  default:
+   gcc_unreachable ();
+  }
+  }
   [(set_attr type neon_from_gp, neon_ins_q)]
 )
 
@@ -719,7 +739,12 @@
(match_operand:VDQF 3 register_operand 0)
(match_operand:SI 2 immediate_operand i)))]
   TARGET_SIMD
-  ins\t%0.Vetype[%p2], %1.Vetype[0];
+  {
+int elt = ENDIAN_LANE_N (MODEmode, exact_log2 (INTVAL (operands[2])));
+
+operands[2] = GEN_INT ((HOST_WIDE_INT)1  elt);
+return ins\t%0.Vetype[%p2], %1.Vetype[0];
+  }
   [(set_attr type neon_insq)]
 )
 
@@ -2022,7 +2047,10 @@
(match_operand:VDQQH 1 register_operand w)
(parallel [(match_operand:SI 2 immediate_operand i)]]
   TARGET_SIMD
-  smov\\t%GPI:w0, %1.VDQQH:Vetype[%2]
+  {
+operands[2] = GEN_INT (ENDIAN_LANE_N (MODEmode, INTVAL (operands[2])));
+return smov\\t%GPI:w0, %1.VDQQH:Vetype[%2];
+  }
   [(set_attr type neon_to_gpq)]
 )
 
@@ -2033,22 +2061,36 @@
(match_operand:VDQQH 1 register_operand w)
(parallel [(match_operand:SI 2 immediate_operand i)]]
   TARGET_SIMD
-  umov\\t%w0, %1.Vetype[%2]
+  {
+operands[2] = GEN_INT (ENDIAN_LANE_N (MODEmode, INTVAL (operands[2])));
+return umov\\t%w0, %1.Vetype[%2];
+  }
   [(set_attr type neon_to_gpq)]
 )
 
 ;; Lane extraction of a value, neither sign nor zero extension
 ;; is guaranteed so upper bits should be considered undefined.
 (define_insn aarch64_get_lanemode
-  [(set (match_operand:VEL 0 register_operand =r, w)
+  [(set (match_operand:VEL 0 register_operand =r, w, Utv)
(vec_select:VEL
- (match_operand:VALL 1 register_operand w, w)
- (parallel [(match_operand:SI 2 immediate_operand i, i)])))]
+ (match_operand:VALL 1 register_operand w, w, w)
+ (parallel [(match_operand:SI 2 immediate_operand i, i, i)])))]
   TARGET_SIMD
-  @
-   umov\\t%vwcore0, %1.Vetype[%2]
-   dup\\t%Vetype0, %1.Vetype[%2]
-  [(set_attr type neon_to_gpq, neon_dupq)]
+  {
+operands[2] = GEN_INT (ENDIAN_LANE_N (MODEmode, INTVAL (operands[2])));
+switch (which_alternative)
+  {
+   case 0:
+ return umov\\t%vwcore0, %1.Vetype[%2];
+   case 1:
+ return dup\\t%Vetype0, %1.Vetype[%2];
+   case 2:
+ return st1\\t{%1.Vetype}[%2], %0;
+   default:
+ gcc_unreachable ();
+  }
+  }
+  [(set_attr type neon_to_gpq, neon_dupq, neon_store1_one_laneq)]
 )
 
 (define_expand aarch64_get_lanedi
@@ -4028,16 +4070,13 @@
 
 ;; Standard pattern name vec_extractmode.
 
-(define_insn vec_extractmode
-  [(set (match_operand:VEL 0 aarch64_simd_nonimmediate_operand =r, w, 
Utv)
-   (vec_select:VEL
- (match_operand:VALL 1 register_operand w, w, w)
- (parallel [(match_operand:SI 2 immediate_operand i,i,i)])))]
+(define_expand vec_extractmode
+  [(match_operand:VEL 0 aarch64_simd_nonimmediate_operand )
+   (match_operand:VALL 1 register_operand )
+   (match_operand:SI 2 immediate_operand )]
   TARGET_SIMD
-  @
-  umov\\t%vw0, %1.Vetype[%2]
-  dup\\t%Vetype0, %1.Vetype[%2]

Re: [PATCH] Support addsub/subadd as non-isomorphic operations for SLP vectorizer.

2013-11-21 Thread Marc Glisse

On Thu, 21 Nov 2013, Cong Hou wrote:


While I added the new define_insn_and_split for vec_merge, a bug is
exposed: in config/i386/sse.md, [ define_expand xop_vmfrczmode2 ]
only takes one input, but the corresponding builtin functions have two
inputs, which are shown in i386.c:

 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2,
__builtin_ia32_vfrczss, IX86_BUILTIN_VFRCZSS, UNKNOWN,
(int)MULTI_ARG_2_SF },
 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2,
__builtin_ia32_vfrczsd, IX86_BUILTIN_VFRCZSD, UNKNOWN,
(int)MULTI_ARG_2_DF },

In consequence, the ix86_expand_multi_arg_builtin() function tries to
check two args but based on the define_expand of xop_vmfrczmode2,
the content of insn_data[CODE_FOR_xop_vmfrczv4sf2].operand[2] may be
incorrect (because it only needs one input).

The patch below fixed this issue.

Bootstrapped and tested on ax x86-64 machine. Note that this patch
should be applied before the one I sent earlier (sorry for sending
them in wrong order).


This is PR 56788. Your patch seems strange to me and I don't think it
fixes the real issue, but I'll let more knowledgeable people answer.

--
Marc Glisse


Re: [patch 1/3] Flatten gimple.h

2013-11-21 Thread Jeff Law

On 11/21/13 11:15, Andrew MacLeod wrote:


Is there anything in particular one needs to do for plugins? I thought I
saw a patch somewhere that changed something in the Makefile, but don't
know if that is actually required since I never did that for any of the
others.   Any plugin which used gimple.h probably needs a few more
includes...
We need to make sure the header files that are needed by plugins appear 
in Makefile.in::PLUGIN_HEADERS so that they get installed in a place 
where plugins can find them.





This bootstraps on x86_64-unknown-linux-gnu, and regressions are
currently running.  Assuming it passes fine, OK?

Reading ;-)

jeff



Overhaul middle-end handling of restrict

2013-11-21 Thread Michael Matz
Hello,

after much pondering about the issue we came up with this design to 
handle restrict more generally.  Without a completely different way of 
representing conflicts (or non-conflicts) of memory references we're bound 
to somehow encode the necessary information into the points-to set (or in 
any case information related to pointer ssa names).  This in turn means 
that the location sensitive nature of restrict needs to be made explicit 
in the IL, i.e. we basically need some starting point when a pointer 
becomes restrict (ADD_RESTRICT), and also an ending point (DEL_RESTRICT), 
which must act as barrier for other memory accesses (if it wouldn't some 
transformations could move references from outside restrict regions into 
the restrict region making them disambiguable with the restrict 
references, introducing a bug).

We also need to use our points-to solver to propagate restrict tags 
conservatively, postprocessing the information to remove too conservative 
estimates afterwards per SSA name.

And if we want to use restrict based disambiguation we also need to 
transfer the barriers into RTL barriers (I'm using an asm with an explicit 
MEM to refer to the pointer in question, so not all memory is clobbered).
There's some potential for improvement here by removing useless 
ADD_RESTRICT / DEL_RESTRICT pairs.

There's another improvement when enlarging the set of SSA names to be 
considered for postprocessing.  Right now only the results of ADD_RESTRICT 
assigns are handled; that can be improved to also process SSA names that 
trivially depend on such (i.e. are offset and themselves restrict-typed).

That facility is used to implement restrict parameters to functions, 
replacing the current ad-hoc way in the points-to solver.  Other uses 
should be done under control of the frontends, as only those know the 
semantics for real.

I have a patch that more aggressively creates ADD_RESTRICT/DEL_RESTRICT 
pairs (basically whenever there's an assignment from non-restrict pointers 
to a restrict pointer, on the grounds that this invents a new restrict 
set), but that breaks C programs that we don't want to break (I consider 
them invalid, but there's disagreement).

Some older incarnations of this were bootstrapped, but this specific patch 
is only now in regstrapping on x86_64-linux.  Okay for trunk if that 
passes?


Ciao,
Michael.
---
* tree.def (ADD_RESTRICT): New tree code.
* cfgexpand.c (expand_debug_expr): Handle it.
* expr.c (expand_pointer_clobber): New function.
(expand_expr_real_2): Use it to handle ADD_RESTRICT.
* expr.h (expand_pointer_clobber): Declare.
* function.c (gimplify_parameters): Return a second gimple_seq,
handle restrict parameters.
* function.h (gimplify_parameters): Adjust.
* gimple-pretty-print.c (dump_binary_rhs): Handle ADD_RESTRICT.
* gimplify.c (gimplify_body): Append second gimple_seq,
adjust call to gimplify_parameters.
* internal-fn.def (DEL_RESTRICT): New internal function code.
* internal-fn.c (expand_DEL_RESTRICT): New function.
* tree-cfg.c (verify_gimple_assign_binary): Check ADD_RESTRICT.
* tree-inline.c (estimate_operator_cost): Handle ADD_RESTRICT.
* tree-pretty-print.c (dump_generic_node): Ditto.
* tree-ssa-dce.c (propagate_necessity): DEL_RESTRICT calls
are only clobbers.
* tree-ssa-structalias.c (build_fake_var_decl_uid): New static
function.
(build_fake_var_decl): Rewrite in terms of above.
(make_heapvar): Take uid parameter.
(make_constraint_from_restrict_uid): New.
(make_constraint_from_restrict): Use above.
(make_constraint_from_global_restrict): Explicitely set global flag.
(handle_lhs_call): Adjust call to make_heapvar.
(find_func_aliases_for_internal_call): New.
(find_func_aliases_for_call): Use it.
(find_func_aliases): Handle ADD_RESTRICT.
(intra_create_variable_infos): Remove any explicit handling
of restrict parameters.
(set_uids_in_ptset): Update instead of overwrite 
vars_contains_escaped_heap flag.
(find_what_var_points_to_1): Renamed from ...
(find_what_var_points_to): ... this, which is now wrapper
postprocessing points-to flags.
(compute_points_to_sets): Ignore DEL_RESTRICT calls.

Index: cfgexpand.c
===
--- cfgexpand.c (revision 205123)
+++ cfgexpand.c (working copy)
@@ -3785,6 +3785,7 @@ expand_debug_expr (tree exp)
   /* Fall through.  */
 
 adjust_mode:
+case ADD_RESTRICT:
 case PAREN_EXPR:
 case NOP_EXPR:
 case CONVERT_EXPR:
Index: expr.c
===
--- expr.c  (revision 205123)
+++ expr.c  (working copy)
@@ -7988,6 +7988,41 @@ expand_cond_expr_using_cmove (tree treeo
   

Re: [PATCH] Builtins handling in IVOPT

2013-11-21 Thread Wei Mi
On Thu, Nov 21, 2013 at 11:36 AM, Richard Biener
richard.guent...@gmail.com wrote:
 Wei Mi w...@google.com wrote:
 So what you are doing is basically not only rewriting memory
references
 to possibly use TARGET_MEM_REF but also address uses to use
 TARGET_MEM_REF.  I think this is a good thing in general
 (given instructions like x86 lea) and I would not bother
distinguishing
 the different kind of uses.

 Richard.


You mean to change normal expr to TMR(expr) form in order to utilize
x86 lea type instructions as much as possible. It is interesting. I
can experiment that idea later. I am not sure if it could simply work.
My concern is x86 lea still has some limitation (such as three
operands lea will have longer latency and can only be issued to
port1), if we change some expr to TMR(expr), will it inhibit cse
opportunity if codegen finds out it is not good to use lea?

 That needs to be determined.  Over all it might be because ivopts runs so 
 early.  At rtl level there should not be big differences apart from better 
 initial address computations.

 Did I misunderstand what your patch does?

 Richard.


My patch wants to address the issue that iv uses serving as memory
reference actuals for load/store/prefetch builtins are treated as
non-linear iv uses instead of address iv uses, and the result of
determine_use_iv_cost is wrong. After we change those uses to address
uses, less ivs may be used, TMR will be generated for those iv uses
and efficent addressing mode could be utilized.

Thanks,
Wei.


Re: Enable -fno-fat-lto-objects by default

2013-11-21 Thread Jan Hubicka
 On 11/19/2013 02:54 PM, Jan Hubicka wrote:
 
 The problem is that you have .a library consisting of slim LTO objects and 
 you link
 with it during configure check without -flto.
 
 On the other hand, many configure checks will never work reliably
 with -flto because they rely heavily on the fact that GCC cannot see
 across translation units and spot crass declaration/definition
 mismatches.

Hmm, you are right. Probably -fuse-linker-plugin is most sensible solution here
(though it sucks).
I wonder what people would think about enabling plugin by default for 4.9 and 
teaching
it to do nothing without -flto on fat objects?

Honza


Re: Use MPFR for real.c decimal-string-to-binary conversions (PR 21718)

2013-11-21 Thread Richard Biener
On Wed, Nov 20, 2013 at 7:17 PM, Joseph S. Myers
jos...@codesourcery.com wrote:
 On Wed, 20 Nov 2013, Richard Biener wrote:

 I suggest to remove real_sqrt and the only use in simplify-rtx.c instead
 (or fix it to use MPFR as well - your choice).

 This patch removes real_sqrt.  (I rather hope that in general little
 if any floating-point constant folding is happening on RTL - it
 doesn't seem like the sort of thing for which RTL expansion should be
 expected to introduce new folding opportunities, and if it does I'd
 tend to think that indicates a deficiency in the GIMPLE optimizers.)

 Bootstrapped with no regressions on x86_64-unknown-linux-gnu.  OK to
 commit?

Ok.

Thanks,
Richard.

 2013-11-20  Joseph Myers  jos...@codesourcery.com

 * real.c (real_sqrt): Remove function.
 * real.h (real_sqrt): Remove prototype.
 * simplify-rtx.c (simplify_const_unary_operation): Do not fold
 SQRT using real_sqrt.

 Index: gcc/real.c
 ===
 --- gcc/real.c  (revision 205119)
 +++ gcc/real.c  (working copy)
 @@ -4765,84 +4765,6 @@ const struct real_format real_internal_format =
  false
};

 -/* Calculate the square root of X in mode MODE, and store the result
 -   in R.  Return TRUE if the operation does not raise an exception.
 -   For details see High Precision Division and Square Root,
 -   Alan H. Karp and Peter Markstein, HP Lab Report 93-93-42, June
 -   1993.  http://www.hpl.hp.com/techreports/93/HPL-93-42.pdf.  */
 -
 -bool
 -real_sqrt (REAL_VALUE_TYPE *r, enum machine_mode mode,
 -  const REAL_VALUE_TYPE *x)
 -{
 -  static REAL_VALUE_TYPE halfthree;
 -  static bool init = false;
 -  REAL_VALUE_TYPE h, t, i;
 -  int iter, exp;
 -
 -  /* sqrt(-0.0) is -0.0.  */
 -  if (real_isnegzero (x))
 -{
 -  *r = *x;
 -  return false;
 -}
 -
 -  /* Negative arguments return NaN.  */
 -  if (real_isneg (x))
 -{
 -  get_canonical_qnan (r, 0);
 -  return false;
 -}
 -
 -  /* Infinity and NaN return themselves.  */
 -  if (!real_isfinite (x))
 -{
 -  *r = *x;
 -  return false;
 -}
 -
 -  if (!init)
 -{
 -  do_add (halfthree, dconst1, dconsthalf, 0);
 -  init = true;
 -}
 -
 -  /* Initial guess for reciprocal sqrt, i.  */
 -  exp = real_exponent (x);
 -  real_ldexp (i, dconst1, -exp/2);
 -
 -  /* Newton's iteration for reciprocal sqrt, i.  */
 -  for (iter = 0; iter  16; iter++)
 -{
 -  /* i(n+1) = i(n) * (1.5 - 0.5*i(n)*i(n)*x).  */
 -  do_multiply (t, x, i);
 -  do_multiply (h, t, i);
 -  do_multiply (t, h, dconsthalf);
 -  do_add (h, halfthree, t, 1);
 -  do_multiply (t, i, h);
 -
 -  /* Check for early convergence.  */
 -  if (iter = 6  real_identical (i, t))
 -   break;
 -
 -  /* ??? Unroll loop to avoid copying.  */
 -  i = t;
 -}
 -
 -  /* Final iteration: r = i*x + 0.5*i*x*(1.0 - i*(i*x)).  */
 -  do_multiply (t, x, i);
 -  do_multiply (h, t, i);
 -  do_add (i, dconst1, h, 1);
 -  do_multiply (h, t, i);
 -  do_multiply (i, dconsthalf, h);
 -  do_add (h, t, i, 0);
 -
 -  /* ??? We need a Tuckerman test to get the last bit.  */
 -
 -  real_convert (r, mode, h);
 -  return true;
 -}
 -
  /* Calculate X raised to the integer exponent N in mode MODE and store
 the result in R.  Return true if the result may be inexact due to
 loss of precision.  The algorithm is the classic left-to-right binary
 Index: gcc/real.h
 ===
 --- gcc/real.h  (revision 205119)
 +++ gcc/real.h  (working copy)
 @@ -461,10 +461,6 @@ bool real_can_shorten_arithmetic (enum machine_mod
  /* In tree.c: wrap up a REAL_VALUE_TYPE in a tree node.  */
  extern tree build_real (tree, REAL_VALUE_TYPE);

 -/* Calculate R as the square root of X in the given machine mode.  */
 -extern bool real_sqrt (REAL_VALUE_TYPE *, enum machine_mode,
 -  const REAL_VALUE_TYPE *);
 -
  /* Calculate R as X raised to the integer exponent N in mode MODE.  */
  extern bool real_powi (REAL_VALUE_TYPE *, enum machine_mode,
const REAL_VALUE_TYPE *, HOST_WIDE_INT);
 Index: gcc/simplify-rtx.c
 ===
 --- gcc/simplify-rtx.c  (revision 205119)
 +++ gcc/simplify-rtx.c  (working copy)
 @@ -1931,17 +1931,13 @@ simplify_const_unary_operation (enum rtx_code code
 SCALAR_FLOAT_MODE_P (mode)
 SCALAR_FLOAT_MODE_P (GET_MODE (op)))
  {
 -  REAL_VALUE_TYPE d, t;
 +  REAL_VALUE_TYPE d;
REAL_VALUE_FROM_CONST_DOUBLE (d, op);

switch (code)
 {
 case SQRT:
 - if (HONOR_SNANS (mode)  real_isnan (d))
 -   return 0;
 - real_sqrt (t, mode, d);
 - d = t;
 - break;
 + return 0;
 case ABS:
   d = real_value_abs (d);
   break;

 --
 Joseph S. Myers
 jos...@codesourcery.com


Re: [PATCH] libstdc++ testsuite cxxflags

2013-11-21 Thread Cesar Philippidis
On 11/21/13, 5:42 AM, Jonathan Wakely wrote:
 On 20 November 2013 23:57, Cesar Philippidis wrote:
 On 11/20/13, 1:46 PM, Jonathan Wakely wrote:
 On 20 November 2013 21:44, Jonathan Wakely wrote:
 On 29 October 2013 15:37, Cesar Philippidis wrote:
 This patch addresses two issues with the libstdc++ testsuite:

   * duplicate -g -O2 CXXFLAGS
   * missing -g -O2 for remote targets

 The duplicate -g -O2 flags is a result of testsuite_flags.in using
 build-time CXXFLAGS and proc libstdc++_init using the environmental
 CXXFLAGS, which defaults to its build-time value. This patch prevents
 testsuite_flags.in from using build-time CXXFLAGS.

 Certain remote targets require a minimum optimization level -O1 in order
 to pass several atomics built-in function tests. This patch ensures
 cxxflags contains -g -O2 at minimum when no other optimization flags
 are specified. The testsuite used to set those flags prior to Benjamin's
 patch to remove duplicate cxxflags here
 http://gcc.gnu.org/ml/gcc-patches/2012-03/msg01572.html.

 Is this OK for trunk? If so, please apply (I don't have commit rights).

 I think so ... although I'm not sure I've got my head round the
 effects in all cases!

 Sorry, I didn't realise gmail thought Ctrl-Enter meant send. I meant
 to ask a couple of questions about it ...

 Is removing EXTRA_CXX_FLAGS necessary too?

 I looked at it again, and it seems to be OK to leave it in there.

 For remote targets, if CXXFLAGS is set in the env can -g still end up 
 missing?

 No, but CXXFLAGS isn't necessarily set in the env. Specifically, if you
 run the testsuite without using the makefile, the CXXFLAGS may not be set.

 I revised the patch to preserve @EXTRA_CXX_FLAGS@. I also append the
 '-g' flag with '-O2', since the '-g' isn't as important in the testsuite
 as '-O2'.

 Is this patch OK? Is so, please commit it because I do not have an svn
 account.
 
 I've been playing around with this patch and CXXFLAGS further, and I'm
 not sure about it now.
 
 What harm do the duplicate flags do? If you want different flags to be
 used when running the testsuite you can set CXXFLAGS, which will come
 later on the command-line and so take precedence. However, if we
 remove -g -O2 from CXXFLAGS_config and you use CXXFLAGS=-DFOO when
 running the testsuite then after this change you won't get the same
 result, you'd have to change to use CXXFLAGS=-g -O2 -DFOO
 
 Is that really what we want?

I see your point. Well, if you want to override CXXFLAGS during testing,
it's probably better to use different environmental variable altogether
and include '-g -O2' as part of the base CXXFLAGS. The attached patch
does that with LIBSTDCXX_CXXFLAGS.

That said, I don't have a strong opinion on the matter, so if you want
to use the libstdcxx_testsuite-b.diff patch without the Makefile.in
changes, that's fine with me.

Cesar
2013-11-21  Cesar Philippidis  ce...@codesourcery.com

libstdc++-v3/
* scripts/testsuite_flags.in (cxxflags): Remove @CXXFLAGS@ since 
libstdc++.exp imports those flags via getenv.
* testsuite/lib/libstdc++.exp (libstdc++_init): Ensure that 
cxxflags contains '-g -O2' flag. Also, use env LIBSTDCXX_CXXFLAGS 
to augment cxxflags instead of env CXXFLAGS.


diff --git a/libstdc++-v3/scripts/testsuite_flags.in 
b/libstdc++-v3/scripts/testsuite_flags.in
index cf692f8..5e7ad32 100755
--- a/libstdc++-v3/scripts/testsuite_flags.in
+++ b/libstdc++-v3/scripts/testsuite_flags.in
@@ -57,7 +57,7 @@ case ${query} in
   ;;
 --cxxflags)
   CXXFLAGS_default=-D_GLIBCXX_ASSERT -fmessage-length=0
-  CXXFLAGS_config=@SECTION_FLAGS@ @CXXFLAGS@ @EXTRA_CXX_FLAGS@
+  CXXFLAGS_config=@SECTION_FLAGS@ @EXTRA_CXX_FLAGS@
   echo ${CXXFLAGS_default} ${CXXFLAGS_config} 
   ;;
 --cxxvtvflags)
diff --git a/libstdc++-v3/testsuite/lib/libstdc++.exp 
b/libstdc++-v3/testsuite/lib/libstdc++.exp
index 0dff98c..2848ca7 100644
--- a/libstdc++-v3/testsuite/lib/libstdc++.exp
+++ b/libstdc++-v3/testsuite/lib/libstdc++.exp
@@ -278,8 +278,8 @@ proc libstdc++_init { testfile } {
set cc [exec sh $flags_file --build-cc]
set includes [exec sh $flags_file --build-includes]
 }
-append cxxflags  
-append cxxflags [getenv CXXFLAGS]
+append cxxflags  -g -O2 
+append cxxflags [getenv LIBSTDCXX_CXXFLAGS]
 v3track cxxflags 2
 
 # Always use MO files built by this test harness.


Re: Enable -fno-fat-lto-objects by default

2013-11-21 Thread Florian Weimer

On 11/19/2013 02:54 PM, Jan Hubicka wrote:


The problem is that you have .a library consisting of slim LTO objects and you 
link
with it during configure check without -flto.


On the other hand, many configure checks will never work reliably with 
-flto because they rely heavily on the fact that GCC cannot see across 
translation units and spot crass declaration/definition mismatches.


--
Florian Weimer / Red Hat Product Security Team


Re: [RFA/RFC patch]: Follow-up on type-demotion pass ...

2013-11-21 Thread Kai Tietz
- Original Message -
 1) Can you please split out the forwprop pieces and motivate them
 separately?

Ok, done.  The factored out forward-propagation part is nevertheless pretty 
strong bound to this patch. And it shares some testsuite adjustment due this.

 2) How did you decide on pass placement?  It looks like you do
 it twice (eh?) and before forwprop passes.  What's the difference
 between the passes?

I added some comment to the description part for those two passes in attached 
patch.  Btw in that description there was already some description on the cause 
of the ordering.

The issue is that the type-demotion pass does its transformation in the 
opposite direction to the forward-propagation pass.  So by running 
forward-propagation after the type-demotion pass we make sure that we always 
just need to handle the normal form of those expressions.
Tests have shown that it isn't much beneficial to run this type-demotion pass 
more often.  This is mainly caused by the fact that no more new 
type-sinking/raising optimization are likely to be done.

The difference between first and second run is, that in first run we don't do 
for +/- any unsigned-type transition optimization.  We need to delay this pass 
after first range-analysis for PLUS/MINUS expressions are done.  Actual we 
could do same optimization for MULT_EPXR too, but due pretty late possible 
overflow-checking of those, we can't perform it.

 3) You don't add a flag to disable this pass, please add one
Add it to attached patch.

 4) You compute post-dominators but appearantly only to specify
 basic-block walk order - instead use one of the basic-block
 odering functions
I rewrote this code. I didn't noticed much speed-improvements by this, but 
anyway it might be more clean that way.
 
 Richard.

Hello,


ChangeLog gcc

* Makefile.in: Add tree-ssa-te.c to build.
* passes.def: Add typedemote passes.
* tree-pass.h (make_pass_demote1): Add prototype.
(make_pass_demote2): Likewise.
* common.opt (ftree-typedemote): Add new option.
* tree-ssa-forwprop.c (simplify_shift):  New function.
(ssa_forward_propagate_and_combine): Use it.
* tree-ssa-te.c: New pass.

Changelog gcc/testsuite:

* gcc.dg/tree-ssa/scev-cast.c: Adjust test.
* gcc.dg/tree-ssa/ssa-fre-2.c: Likewise.
* gcc.dg/tree-ssa/ssa-fre-3.c: Likewise.
* gcc.dg/tree-ssa/ssa-fre-4.c: Likewise.
* gcc.dg/tree-ssa/ssa-fre-5.c: Likewise.
* gcc.dg/tree-ssa/ts-add-1.c: New test.
* gcc.dg/tree-ssa/ts-add-2.c: New test.
* gcc.dg/tree-ssa/ts-add-3.c: New test.
* gcc.dg/tree-ssa/ts-shift-1.c: New test.


Shared testsuite part between type-demotion and forward-propagation part.

Changelog gcc/testsuite:

* gcc.dg/vect/vect-over-widen-1-big-array.c: Likewise.
* gcc.dg/vect/vect-over-widen-1.c: Likewise.
* gcc.dg/vect/vect-over-widen-3-big-array.c: Likewise.
* gcc.dg/vect/vect-over-widen-3.c: Likewise.
* gcc.dg/vect/vect-over-widen-4-big-array.c: Likewise.
* gcc.dg/vect/vect-over-widen-4.c: Likewise.

Test for x86_64-unknown-linux-gnu, and i686-pc-cygwin, and x86_64-pc-cygwin.

Ok for apply?

Regards,
Kai


Index: gcc-trunk/gcc/Makefile.in
===
--- gcc-trunk.orig/gcc/Makefile.in
+++ gcc-trunk/gcc/Makefile.in
@@ -1430,6 +1430,7 @@ OBJS = \
tree-ssa-pre.o \
tree-ssa-propagate.o \
tree-ssa-reassoc.o \
+   tree-ssa-te.o \
tree-ssa-sccvn.o \
tree-ssa-sink.o \
tree-ssa-strlen.o \
Index: gcc-trunk/gcc/passes.def
===
--- gcc-trunk.orig/gcc/passes.def
+++ gcc-trunk/gcc/passes.def
@@ -65,6 +65,7 @@ along with GCC; see the file COPYING3.
  NEXT_PASS (pass_remove_cgraph_callee_edges);
  NEXT_PASS (pass_rename_ssa_copies);
  NEXT_PASS (pass_ccp);
+ NEXT_PASS (pass_demote1);
  /* After CCP we rewrite no longer addressed locals into SSA
 form if possible.  */
  NEXT_PASS (pass_forwprop);
@@ -137,6 +138,7 @@ along with GCC; see the file COPYING3.
   /* After CCP we rewrite no longer addressed locals into SSA
 form if possible.  */
   NEXT_PASS (pass_phiprop);
+  NEXT_PASS (pass_demote2);
   NEXT_PASS (pass_forwprop);
   NEXT_PASS (pass_object_sizes);
   /* pass_build_alias is a dummy pass that ensures that we
Index: gcc-trunk/gcc/tree-pass.h
===
--- gcc-trunk.orig/gcc/tree-pass.h
+++ gcc-trunk/gcc/tree-pass.h
@@ -430,6 +430,8 @@ extern gimple_opt_pass *make_pass_vrp (g
 extern gimple_opt_pass *make_pass_uncprop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_return_slot (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_reassoc (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_demote1 

[PATCH] Make the IRA shrink-wrapping preparation also work on ppc64

2013-11-21 Thread Martin Jambor
Hi,

the patch below enables IRA live-range splitting that later
facilitates shrink-wrapping also work on ppc64.  The difference is
that while on x86_64 it was enough to look for single sets from a hard
register to a pseudo in the first BB, on ppc the instructions are more
complicated and can look like this (example from pr10474.c testcase):

(insn 6 3 7 2 (parallel [
(set (reg:CC 124)
(compare:CC (reg:DI 3 3 [ i ])
(const_int 0 [0])))
(set (reg/v/f:DI 123 [ i ])
(reg:DI 3 3 [ i ]))
]) pr10474.c:6 428 {*movdi_internal2}
 (expr_list:REG_DEAD (reg:DI 3 3 [ i ])
(nil)))

So I changed the code that determines whether an instruction is
interesting or not to also go through a parallel instructions and be
happy with them if there is exactly one interesting SET part.

Unfortunately, I also had to change two testcases that check this to
use long instead of int, otherwise I get SUBREG uses that make
split_live_ranges_for_shrink_wrap give up.  I will try to get rid of
that limitation (and have other ideas for improvement as well) but
that is something for 4.10.  It works for pointers (e.g. in pr10474.c)
and that is probably most important.

Bootstrapped and tested on ppc64-linux (all languages), x86_64-linux
(all languages + Ada and ObjC++) and on ia64-linux (C, C++ and
Fortran).  OK for trunk?

Thanks,

Martin


2013-11-20  Martin Jambor  mjam...@suse.cz

PR rtl-optimization/10474
* ira.c (interesting_dest_for_shprep_1): New function.
(interesting_dest_for_shprep): Use interesting_dest_for_shprep_1,
also check parallels.

testsuite/
* gcc.dg/pr10474.c: Also test ppc64.
* gcc.dg/ira-shrinkwrap-prep-1.c: Also test ppc64, changed all ints
to longs.
* gcc.dg/ira-shrinkwrap-prep-2.c: Likewise.

diff --git a/gcc/ira.c b/gcc/ira.c
index f4fdb11..34d9649 100644
--- a/gcc/ira.c
+++ b/gcc/ira.c
@@ -4847,17 +4847,13 @@ find_moveable_pseudos (void)
   free_dominance_info (CDI_DOMINATORS);
 }
 
-
-/* If insn is interesting for parameter range-splitting shring-wrapping
-   preparation, i.e. it is a single set from a hard register to a pseudo, which
-   is live at CALL_DOM, return the destination.  Otherwise return NULL.  */
+/* If SET pattern SET is an assignment from a hard register to a pseudo which
+   is live at CALL_DOM (if non-NULL, otherwise this check is omitted), return
+   the destination.  Otherwise return NULL.  */
 
 static rtx
-interesting_dest_for_shprep (rtx insn, basic_block call_dom)
+interesting_dest_for_shprep_1 (rtx set, basic_block call_dom)
 {
-  rtx set = single_set (insn);
-  if (!set)
-return NULL;
   rtx src = SET_SRC (set);
   rtx dest = SET_DEST (set);
   if (!REG_P (src) || !HARD_REGISTER_P (src)
@@ -4867,6 +4863,41 @@ interesting_dest_for_shprep (rtx insn, basic_block 
call_dom)
   return dest;
 }
 
+/* If insn is interesting for parameter range-splitting shrink-wrapping
+   preparation, i.e. it is a single set from a hard register to a pseudo, which
+   is live at CALL_DOM (if non-NULL, otherwise this check is omitted), or a
+   parallel statement with only one such statement, return the destination.
+   Otherwise return NULL.  */
+
+static rtx
+interesting_dest_for_shprep (rtx insn, basic_block call_dom)
+{
+  if (!INSN_P (insn))
+return NULL;
+  rtx pat = PATTERN (insn);
+  if (GET_CODE (pat) == SET)
+return interesting_dest_for_shprep_1 (pat, call_dom);
+
+  if (GET_CODE (pat) != PARALLEL)
+return NULL;
+  rtx ret = NULL;
+  for (int i = 0; i  XVECLEN (pat, 0); i++)
+{
+  rtx sub = XVECEXP (pat, 0, i);
+  if (GET_CODE (sub) == USE || GET_CODE (sub) == CLOBBER)
+   continue;
+  if (GET_CODE (sub) != SET
+ || side_effects_p (sub))
+   return NULL;
+  rtx dest = interesting_dest_for_shprep_1 (sub, call_dom);
+  if (dest  ret)
+   return NULL;
+  if (dest)
+   ret = dest;
+}
+  return ret;
+}
+
 /* Split live ranges of pseudos that are loaded from hard registers in the
first BB in a BB that dominates all non-sibling call if such a BB can be
found and is not in a loop.  Return true if the function has made any
diff --git a/gcc/testsuite/gcc.dg/ira-shrinkwrap-prep-1.c 
b/gcc/testsuite/gcc.dg/ira-shrinkwrap-prep-1.c
index 4fc00b2..54d3e76 100644
--- a/gcc/testsuite/gcc.dg/ira-shrinkwrap-prep-1.c
+++ b/gcc/testsuite/gcc.dg/ira-shrinkwrap-prep-1.c
@@ -1,18 +1,18 @@
-/* { dg-do compile { target { x86_64-*-*  lp64 } } } */
+/* { dg-do compile { target { { x86_64-*-*  lp64 } || { powerpc*-*-*  lp64 
} } } } */
 /* { dg-options -O3 -fdump-rtl-ira -fdump-rtl-pro_and_epilogue  } */
 
-int __attribute__((noinline, noclone))
-foo (int a)
+long __attribute__((noinline, noclone))
+foo (long a)
 {
   return a + 5;
 }
 
-static int g;
+static long g;
 
-int __attribute__((noinline, noclone))
-bar (int a)
+long __attribute__((noinline, noclone))
+bar (long a)
 {
-  int r;
+  long 

[PATCH][2/3] Fix PR59058

2013-11-21 Thread Richard Biener

This removes the broken function from tree-scalar-evolution.c and
re-implements it inside the now single user (but unfixed).  It
also re-shuffles the vectorizer niter code some more to make
the final fix (use # of latch executions throughout) more easy.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2013-11-21  Richard Biener  rguent...@suse.de

PR tree-optimization/59058
* tree-scalar-evolution.h (number_of_exit_cond_executions): Remove.
* tree-scalar-evolution.c (number_of_exit_cond_executions): Likewise.
* tree-vectorizer.h (LOOP_PEELING_FOR_ALIGNMENT): Rename to ...
(LOOP_VINFO_PEELING_FOR_ALIGNMENT): ... this.
(NITERS_KNOWN_P): Fold into ...
(LOOP_VINFO_NITERS_KNOWN_P): ... this.
(LOOP_VINFO_PEELING_FOR_NITER): Add.
* tree-vect-loop-manip.c (vect_gen_niters_for_prolog_loop):
Use LOOP_VINFO_PEELING_FOR_ALIGNMENT.
(vect_do_peeling_for_alignment): Re-use precomputed niter
instead of re-emitting it.
* tree-vect-data-refs.c (vect_enhance_data_refs_alignment):
Use LOOP_VINFO_PEELING_FOR_ALIGNMENT.
* tree-vect-loop.c (vect_get_loop_niters): Use
number_of_latch_executions.
(new_loop_vec_info): Initialize LOOP_VINFO_PEELING_FOR_NITER.
(vect_analyze_loop_form): Simplify.
(vect_analyze_loop_operations): Move epilogue peeling code ...
(vect_analyze_loop_2): ... here and adjust it to compute
LOOP_VINFO_PEELING_FOR_NITER.
(vect_estimate_min_profitable_iters): Use
LOOP_VINFO_PEELING_FOR_ALIGNMENT.
(vect_build_loop_niters): Emit on the preheader.
(vect_generate_tmps_on_preheader): Likewise.
(vect_transform_loop): Use LOOP_VINFO_PEELING_FOR_NITER instead
of recomputing it.  Adjust.

Index: gcc/tree-vect-loop-manip.c
===
*** gcc/tree-vect-loop-manip.c.orig 2013-11-21 14:58:43.061653802 +0100
--- gcc/tree-vect-loop-manip.c  2013-11-21 14:58:51.151747654 +0100
*** vect_gen_niters_for_prolog_loop (loop_ve
*** 1736,1751 
  
pe = loop_preheader_edge (loop);
  
!   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)  0)
  {
!   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  
if (dump_enabled_p ())
  dump_printf_loc (MSG_NOTE, vect_location,
   known peeling = %d.\n, npeel);
  
iters = build_int_cst (niters_type, npeel);
!   *bound = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  }
else
  {
--- 1736,1751 
  
pe = loop_preheader_edge (loop);
  
!   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)  0)
  {
!   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  
if (dump_enabled_p ())
  dump_printf_loc (MSG_NOTE, vect_location,
   known peeling = %d.\n, npeel);
  
iters = build_int_cst (niters_type, npeel);
!   *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  }
else
  {
*** vect_do_peeling_for_alignment (loop_vec_
*** 1876,1882 
  {
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree niters_of_prolog_loop;
-   tree n_iters;
tree wide_prolog_niters;
struct loop *new_loop;
int max_iter;
--- 1876,1881 
*** vect_do_peeling_for_alignment (loop_vec_
*** 1918,1926 
 loop to %d\n, max_iter);
  
/* Update number of times loop executes.  */
-   n_iters = LOOP_VINFO_NITERS (loop_vinfo);
LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
!   TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
  
if (types_compatible_p (sizetype, TREE_TYPE (niters_of_prolog_loop)))
  wide_prolog_niters = niters_of_prolog_loop;
--- 1917,1924 
 loop to %d\n, max_iter);
  
/* Update number of times loop executes.  */
LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
!   TREE_TYPE (ni_name), ni_name, niters_of_prolog_loop);
  
if (types_compatible_p (sizetype, TREE_TYPE (niters_of_prolog_loop)))
  wide_prolog_niters = niters_of_prolog_loop;
Index: gcc/tree-vectorizer.h
===
*** gcc/tree-vectorizer.h.orig  2013-11-21 14:58:43.062653811 +0100
--- gcc/tree-vectorizer.h   2013-11-21 14:58:51.153747678 +0100
*** typedef struct _loop_vec_info {
*** 361,367 
  #define LOOP_VINFO_DATAREFS(L) (L)-datarefs
  #define LOOP_VINFO_DDRS(L) (L)-ddrs
  #define LOOP_VINFO_INT_NITERS(L)   (TREE_INT_CST_LOW ((L)-num_iters))
! #define LOOP_PEELING_FOR_ALIGNMENT(L)  (L)-peeling_for_alignment
  #define LOOP_VINFO_UNALIGNED_DR(L) (L)-unaligned_dr
  #define LOOP_VINFO_MAY_MISALIGN_STMTS(L)   (L)-may_misalign_stmts
  #define LOOP_VINFO_MAY_ALIAS_DDRS(L)   (L)-may_alias_ddrs
--- 361,367 
  #define 

Re: [patch 1/3] Flatten gimple.h

2013-11-21 Thread Eric Botcazou
 I moved recalculate_side_effects() from gimple.c to gimplify.c.
 gimplify.c was the primary consumer, and the only other caller was in
 the ada front end.  By moving it, the ada front end doesn't need
 gimple.h any more.

Let's eliminate the only use in the Ada front end then, we probably just 
need to propagate TREE_SIDE_EFFECTS from gnu_loop_stmt to gnu_cond_expr.

-- 
Eric Botcazou


[Patch 1/4] Conform vector implementation to ABI - Loads and Stores.

2013-11-21 Thread Tejas Belagod

Hi,

The attached patch fixes the mov<mode> standard pattern name for ABI conformance 
for vector modes.


Tested for aarch64-none-elf, aarch64_be-none-elf. OK for trunk?

Thanks,
Tejas Belagod
ARM.

Changelog:

2013-11-21  Tejas Belagod  tejas.bela...@arm.com

gcc/
* config/aarch64/aarch64-simd.md (*aarch64_simd_movmode): Fix loads
and stores to be ABI compliant.diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index b9ebdf5..19ef203 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -85,17 +85,17 @@
 
 (define_insn *aarch64_simd_movmode
   [(set (match_operand:VD 0 aarch64_simd_nonimmediate_operand
-   =w, Utv,  w, ?r, ?w, ?r, w)
+   =w, m,  w, ?r, ?w, ?r, w)
(match_operand:VD 1 aarch64_simd_general_operand
-   Utv,  w,  w,  w,  r,  r, Dn))]
+   m,  w,  w,  w,  r,  r, Dn))]
   TARGET_SIMD
 (register_operand (operands[0], MODEmode)
|| register_operand (operands[1], MODEmode))
 {
switch (which_alternative)
  {
- case 0: return ld1\t{%0.Vtype}, %1;
- case 1: return st1\t{%1.Vtype}, %0;
+ case 0: return ldr\\t%d0, %1;
+ case 1: return str\\t%d1, %0;
  case 2: return orr\t%0.Vbtype, %1.Vbtype, %1.Vbtype;
  case 3: return umov\t%0, %1.d[0];
  case 4: return ins\t%0.d[0], %1;
@@ -113,9 +113,9 @@
 
 (define_insn *aarch64_simd_movmode
   [(set (match_operand:VQ 0 aarch64_simd_nonimmediate_operand
-   =w, Utv,  w, ?r, ?w, ?r, w)
+   =w, m,  w, ?r, ?w, ?r, w)
(match_operand:VQ 1 aarch64_simd_general_operand
-   Utv,  w,  w,  w,  r,  r, Dn))]
+   m,  w,  w,  w,  r,  r, Dn))]
   TARGET_SIMD
 (register_operand (operands[0], MODEmode)
|| register_operand (operands[1], MODEmode))
@@ -123,9 +123,9 @@
   switch (which_alternative)
 {
 case 0:
-   return ld1\t{%0.Vtype}, %1;
+   return ldr\\t%q0, %1;
 case 1:
-   return st1\t{%1.Vtype}, %0;
+   return str\\t%q1, %0;
 case 2:
return orr\t%0.Vbtype, %1.Vbtype, %1.Vbtype;
 case 3:

Re: [fortran, patch] Add Fortran 2003 IEEE intrinsic modules

2013-11-21 Thread N.M. Maclaren

On Nov 21 2013, Joseph S. Myers wrote:

On Thu, 21 Nov 2013, FX wrote:

Indeed, 387/SSE has flush-to-zero modes. But other APIs do not (glibc, 
SysV, AIX).


Note that glibc libm functions may not work when called in a flush-to-zero 
mode, only in modes that can be established by the fenv.h functions.


Well, that's two clear bugs :-(

If, as you say, they may not work at all in combination with -ffast-math,
that's one.

Setting __STDC_IEC_559__ to 1 in combination with -ffast-math is another,
given that C99 and C11 reference ISO/IEC 10559 (1989).


Regards,
Nick Maclaren.



Re: Overhaul middle-end handling of restrict

2013-11-21 Thread Xinliang David Li
On Thu, Nov 21, 2013 at 10:03 AM, Michael Matz m...@suse.de wrote:
 Hello,

 after much pondering about the issue we came up with this design to
 handle restrict more generally.  Without a completely different way of
 representing conflicts (or non-conflicts) of memory references we're bound
 to somehow encode the necessary information into the points-to set (or in
 any case information related to pointer ssa names).  This in turn means
 that the location sensitive nature of restrict needs to be made explicit
 in the IL, i.e. we basically need some starting point when a pointer
 becomes restrict (ADD_RESTRICT), and also an ending point (DEL_RESTRICT),
 which must act as barrier for other memory accesses (if it wouldn't some
 transformations could move references from outside restrict regions into
 the restrict region making them disasmbiguable with the restrict
 references, introducing a bug).


Can you use block/scope information to address this ? References from
enclosing scopes can be considered possible aliases.

David

 We also need to use our points-to solver to propagate restrict tags
 conservatively, postprocessing the information to remove too conservative
 estimates afterwards per SSA name.

 And if we want to use restrict based disambiguation we also need to
 transfer the barriers into RTL barriers (I'm using an asm with an explicit
 MEM to refer to the pointer in question, so not all memory is clobbered).
 There's some potential for improvement here by removing useless
 ADD_RESTRICT / DEL_RESTRICT pairs.

 There's another improvement when enlargening the set of SSA names to be
 considered for postprocessin.  Right now only the result of ADD_RESTRICT
 assigns are handled, that can be improved to also process SSA names that
 trivial depend on such (i.e. are offsetted and themself restrict typed).

 That facility is used to implement restrict parameters to functions,
 replacing the current ad-hoc way in the points-to solver.  Other uses
 should be done under control of the frontends, as only those know the
 semantics for real.

 I have a patch that more aggressively creates ADD_RESTRICT/DEL_RESTRICT
 pairs (basically whenever there's an assignment from non-restrict pointers
 to a restrict pointer, on the grounds that this invents a new restrict
 set), but that breaks C programs that we don't want to break (I consider
 them invalid, but there's disagreement).

 Some older incarnations of this were bootstrapped, but this specific patch
 is only now in regstrapping on x86_64-linux.  Okay for trunk if that
 passes?






 Ciao,
 Michael.
 ---
 * tree.def (ADD_RESTRICT): New tree code.
 * cfgexpand.c (expand_debug_expr): Handle it.
 * expr.c (expand_pointer_clobber): New function.
 (expand_expr_real_2): Use it to handle ADD_RESTRICT.
 * expr.h (expand_pointer_clobber): Declare.
 * function.c (gimplify_parameters): Return a second gimple_seq,
 handle restrict parameters.
 * function.h (gimplify_parameters): Adjust.
 * gimple-pretty-print.c (dump_binary_rhs): Handle ADD_RESTRICT.
 * gimplify.c (gimplify_body): Append second gimple_seq,
 adjust call to gimplify_parameters.
 * internal-fn.def (DEL_RESTRICT): New internal function code.
 * internal-fn.c (expand_DEL_RESTRICT): New function.
 * tree-cfg.c (verify_gimple_assign_binary): Check ADD_RESTRICT.
 * tree-inline.c (estimate_operator_cost): Handle ADD_RESTRICT.
 * tree-pretty-print.c (dump_generic_node): Ditto.
 * tree-ssa-dce.c (propagate_necessity): DEL_RESTRICT calls
 are only clobbers.
 * tree-ssa-structalias.c (build_fake_var_decl_uid): New static
 function.
 (build_fake_var_decl): Rewrite in terms of above.
 (make_heapvar): Take uid parameter.
 (make_constraint_from_restrict_uid): New.
 (make_constraint_from_restrict): Use above.
 (make_constraint_from_global_restrict): Explicitly set global flag.
 (handle_lhs_call): Adjust call to make_heapvar.
 (find_func_aliases_for_internal_call): New.
 (find_func_aliases_for_call): Use it.
 (find_func_aliases): Handle ADD_RESTRICT.
 (intra_create_variable_infos): Remove any explicit handling
 of restrict parameters.
 (set_uids_in_ptset): Update instead of overwrite
 vars_contains_escaped_heap flag.
 (find_what_var_points_to_1): Renamed from ...
 (find_what_var_points_to): ... this, which is now wrapper
 postprocessing points-to flags.
 (compute_points_to_sets): Ignore DEL_RESTRICT calls.

 Index: cfgexpand.c
 ===
 --- cfgexpand.c (revision 205123)
 +++ cfgexpand.c (working copy)
 @@ -3785,6 +3785,7 @@ expand_debug_expr (tree exp)
/* Fall through.  */

  adjust_mode:
 +case ADD_RESTRICT:
  case PAREN_EXPR:
 

[PATCH] Missing __divtf3@@GCC_4.4.0 in libgcc on ia64

2013-11-21 Thread Andreas Schwab
Since there is already the __divtf3@GCC_3.0 compatibility alias in
libgcc we need to attach an explicit symbol version to the real __divtf3
in order to get it exported.  This fixes the unversioned reference in
libgfortran.so, and fixes the failure of gfortran.dg/erf_3.F90.  Tested
on ia64-suse-linux.

Andreas.

PR target/59227
PR target/59230
* config/ia64/t-softfp-compat (softfp_file_list): Filter out
soft-fp/divtf3.c.
(LIB2ADD): Add config/ia64/divtf3.c.
* config/ia64/divtf3.c: New file.

diff --git a/libgcc/config/ia64/divtf3.c b/libgcc/config/ia64/divtf3.c
new file mode 100644
index 000..e1afa29
--- /dev/null
+++ b/libgcc/config/ia64/divtf3.c
@@ -0,0 +1,9 @@
+#ifdef SHARED
+#define __divtf3 __divtf3_shared
+#endif
+
+#include soft-fp/divtf3.c
+
+#ifdef SHARED
+asm (.symver __divtf3_shared, __divtf3@@GCC_4.4.0);
+#endif
diff --git a/libgcc/config/ia64/t-softfp-compat 
b/libgcc/config/ia64/t-softfp-compat
index 00f45d5..38bcea7 100644
--- a/libgcc/config/ia64/t-softfp-compat
+++ b/libgcc/config/ia64/t-softfp-compat
@@ -5,3 +5,6 @@ libgcc1-tf-functions = __divxf3  _fixtfdi _fixunstfdi _floatditf
 LIB1ASMFUNCS := $(filter-out $(libgcc1-tf-functions), $(LIB1ASMFUNCS))
 libgcc1-tf-compats = $(addsuffix .S, $(libgcc1-tf-functions))
 LIB2ADD += $(addprefix $(srcdir)/config/ia64/, $(libgcc1-tf-compats))
+# Wrap divtf3.c to set the default symbol version
+softfp_file_list := $(filter-out $(srcdir)/soft-fp/divtf3.c, 
$(softfp_file_list))
+LIB2ADD += $(srcdir)/config/ia64/divtf3.c
-- 
1.8.4.3

-- 
Andreas Schwab, SUSE Labs, sch...@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
And now for something completely different.


  1   2   >