Re: [PATCH] graphite: Fix non-INTEGER_TYPE integral comparison handling [PR114041]

2024-02-28 Thread Richard Biener
On Wed, 28 Feb 2024, Jakub Jelinek wrote:

> Hi!
> 
> The following testcases are miscompiled, because graphite ignores boolean,
> enumerated or _BitInt comparisons, rewrites the code as if the comparisons
> were always true or always false.
> 
> The INTEGER_TYPE checks were initially added in r6-2239 but at that point
> it was both in add_conditions_to_domain and in parameter_index_in_region.
> Later on the check was also added to stmt_simple_for_scop_p, and finally
> r8-3931 changed the stmt_simple_for_scop_p check to INTEGRAL_TYPE_P
> and turned the parameter_index_in_region -> assign_parameter_index_in_region
> into INTEGRAL_TYPE_P assertion, but the add_conditions_to_domain check
> for INTEGER_TYPE remained.
> 
> The following patch uses INTEGRAL_TYPE_P to complete the change.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux (--with-isl only
> on the former though), ok for trunk?

OK.  As said this should probably be an assert, but I won't bother you
with that (and the possible fallout).

Thanks,
Richard.

> 2024-02-28  Jakub Jelinek  
> 
>   PR tree-optimization/114041
>   * graphite-sese-to-poly.cc (add_conditions_to_domain): Check for
>   INTEGRAL_TYPE_P check rather than INTEGER_TYPE.
> 
>   * gcc.dg/graphite/run-id-pr114041-1.c: New test.
>   * gcc.dg/graphite/run-id-pr114041-2.c: New test.
> 
> --- gcc/graphite-sese-to-poly.cc.jj   2024-01-03 11:51:29.136764430 +0100
> +++ gcc/graphite-sese-to-poly.cc  2024-02-27 19:35:07.668304435 +0100
> @@ -391,8 +391,9 @@ add_conditions_to_domain (poly_bb_p pbb)
>{
>case GIMPLE_COND:
> {
> -/* Don't constrain on anything else than INTEGER_TYPE.  */
> - if (TREE_CODE (TREE_TYPE (gimple_cond_lhs (stmt))) != INTEGER_TYPE)
> + /* Don't constrain on anything else than INTEGRAL_TYPE_P.  */
> + tree cmp_type = TREE_TYPE (gimple_cond_lhs (stmt));
> + if (!INTEGRAL_TYPE_P (cmp_type))
>break;
>  
>   gcond *cond_stmt = as_a <gcond *> (stmt);
> --- gcc/testsuite/gcc.dg/graphite/run-id-pr114041-1.c.jj  2024-02-27 
> 18:42:26.864025806 +0100
> +++ gcc/testsuite/gcc.dg/graphite/run-id-pr114041-1.c 2024-02-27 
> 18:43:07.310466262 +0100
> @@ -0,0 +1,23 @@
> +/* PR tree-optimization/114041 */
> +/* { dg-require-effective-target bitint } */
> +/* { dg-options "-O -fgraphite-identity" } */
> +
> +unsigned a[24], b[24];
> +
> +__attribute__((noipa)) unsigned
> +foo (unsigned _BitInt(8) x)
> +{
> +  for (int i = 0; i < 24; ++i)
> +a[i] = i;
> +  unsigned e = __builtin_stdc_bit_ceil (x);
> +  for (int i = 0; i < 24; ++i)
> +b[i] = i;
> +  return e;
> +}
> +
> +int
> +main ()
> +{
> +  if (foo (0) != 1)
> +__builtin_abort ();
> +}
> --- gcc/testsuite/gcc.dg/graphite/run-id-pr114041-2.c.jj  2024-02-27 
> 19:36:02.373547881 +0100
> +++ gcc/testsuite/gcc.dg/graphite/run-id-pr114041-2.c 2024-02-27 
> 19:36:22.515269333 +0100
> @@ -0,0 +1,27 @@
> +/* PR tree-optimization/114041 */
> +/* { dg-options "-O -fgraphite-identity" } */
> +
> +unsigned a[24], b[24];
> +enum E { E0 = 0, E1 = 1, E42 = 42, E56 = 56 };
> +
> +__attribute__((noipa)) unsigned
> +foo (enum E x)
> +{
> +  for (int i = 0; i < 24; ++i)
> +a[i] = i;
> +  unsigned e;
> +  if (x >= E42)
> +e = __builtin_clz ((unsigned) x);
> +  else
> +e = 42;
> +  for (int i = 0; i < 24; ++i)
> +b[i] = i;
> +  return e;
> +}
> +
> +int
> +main ()
> +{
> +  if (foo (E1) != 42 || foo (E56) != __SIZEOF_INT__ * __CHAR_BIT__ - 6)
> +__builtin_abort ();
> +}
> 
>   Jakub
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


[PATCH] graphite: Fix non-INTEGER_TYPE integral comparison handling [PR114041]

2024-02-28 Thread Jakub Jelinek
Hi!

The following testcases are miscompiled, because graphite ignores boolean,
enumerated or _BitInt comparisons, rewrites the code as if the comparisons
were always true or always false.

The INTEGER_TYPE checks were initially added in r6-2239 but at that point
it was both in add_conditions_to_domain and in parameter_index_in_region.
Later on the check was also added to stmt_simple_for_scop_p, and finally
r8-3931 changed the stmt_simple_for_scop_p check to INTEGRAL_TYPE_P
and turned the parameter_index_in_region -> assign_parameter_index_in_region
into INTEGRAL_TYPE_P assertion, but the add_conditions_to_domain check
for INTEGER_TYPE remained.

The following patch uses INTEGRAL_TYPE_P to complete the change.

Bootstrapped/regtested on x86_64-linux and i686-linux (--with-isl only
on the former though), ok for trunk?

2024-02-28  Jakub Jelinek  

PR tree-optimization/114041
* graphite-sese-to-poly.cc (add_conditions_to_domain): Check for
INTEGRAL_TYPE_P check rather than INTEGER_TYPE.

* gcc.dg/graphite/run-id-pr114041-1.c: New test.
* gcc.dg/graphite/run-id-pr114041-2.c: New test.

--- gcc/graphite-sese-to-poly.cc.jj 2024-01-03 11:51:29.136764430 +0100
+++ gcc/graphite-sese-to-poly.cc2024-02-27 19:35:07.668304435 +0100
@@ -391,8 +391,9 @@ add_conditions_to_domain (poly_bb_p pbb)
   {
   case GIMPLE_COND:
  {
-/* Don't constrain on anything else than INTEGER_TYPE.  */
-   if (TREE_CODE (TREE_TYPE (gimple_cond_lhs (stmt))) != INTEGER_TYPE)
+   /* Don't constrain on anything else than INTEGRAL_TYPE_P.  */
+   tree cmp_type = TREE_TYPE (gimple_cond_lhs (stmt));
+   if (!INTEGRAL_TYPE_P (cmp_type))
   break;
 
gcond *cond_stmt = as_a <gcond *> (stmt);
--- gcc/testsuite/gcc.dg/graphite/run-id-pr114041-1.c.jj2024-02-27 
18:42:26.864025806 +0100
+++ gcc/testsuite/gcc.dg/graphite/run-id-pr114041-1.c   2024-02-27 
18:43:07.310466262 +0100
@@ -0,0 +1,23 @@
+/* PR tree-optimization/114041 */
+/* { dg-require-effective-target bitint } */
+/* { dg-options "-O -fgraphite-identity" } */
+
+unsigned a[24], b[24];
+
+__attribute__((noipa)) unsigned
+foo (unsigned _BitInt(8) x)
+{
+  for (int i = 0; i < 24; ++i)
+a[i] = i;
+  unsigned e = __builtin_stdc_bit_ceil (x);
+  for (int i = 0; i < 24; ++i)
+b[i] = i;
+  return e;
+}
+
+int
+main ()
+{
+  if (foo (0) != 1)
+__builtin_abort ();
+}
--- gcc/testsuite/gcc.dg/graphite/run-id-pr114041-2.c.jj2024-02-27 
19:36:02.373547881 +0100
+++ gcc/testsuite/gcc.dg/graphite/run-id-pr114041-2.c   2024-02-27 
19:36:22.515269333 +0100
@@ -0,0 +1,27 @@
+/* PR tree-optimization/114041 */
+/* { dg-options "-O -fgraphite-identity" } */
+
+unsigned a[24], b[24];
+enum E { E0 = 0, E1 = 1, E42 = 42, E56 = 56 };
+
+__attribute__((noipa)) unsigned
+foo (enum E x)
+{
+  for (int i = 0; i < 24; ++i)
+a[i] = i;
+  unsigned e;
+  if (x >= E42)
+e = __builtin_clz ((unsigned) x);
+  else
+e = 42;
+  for (int i = 0; i < 24; ++i)
+b[i] = i;
+  return e;
+}
+
+int
+main ()
+{
+  if (foo (E1) != 42 || foo (E56) != __SIZEOF_INT__ * __CHAR_BIT__ - 6)
+__builtin_abort ();
+}

Jakub



Re: [PATCH][GRAPHITE] Fix PR71351

2017-12-19 Thread Tom de Vries

On 12/19/2017 02:05 PM, Richard Biener wrote:

On Tue, 19 Dec 2017, Tom de Vries wrote:


On 09/21/2017 12:07 PM, Richard Biener wrote:

-exit_edge = create_empty_if_region_on_edge (entry_edge,
-   unshare_expr (cond_expr));


This removes the fix for PR70045:
...
diff --git a/gcc/graphite-isl-ast-to-gimple.c
b/gcc/graphite-isl-ast-to-gimple.c
index 89a4118..8dd5dc8 100644
--- a/gcc/graphite-isl-ast-to-gimple.c
+++ b/gcc/graphite-isl-ast-to-gimple.c
@@ -821,7 +821,8 @@ graphite_create_new_loop_guard (edge entry_edge,
if (integer_onep (cond_expr))
  exit_edge = entry_edge;
else
-exit_edge = create_empty_if_region_on_edge (entry_edge, cond_expr);
+exit_edge = create_empty_if_region_on_edge (entry_edge,
+   unshare_expr (cond_expr));

return exit_edge;
  }
...


Consequently, the pr70045.c testcase is currently ICE-ing.


Sorry.



Np. Nice to see that the regression test caught it :)


Attached patch fixes this.

OK for trunk if bootstrap and reg-test on x86_64 succeed?


Ok.


Thanks,
- Tom


Re: [PATCH][GRAPHITE] Fix PR71351

2017-12-19 Thread Richard Biener
On Tue, 19 Dec 2017, Tom de Vries wrote:

> On 09/21/2017 12:07 PM, Richard Biener wrote:
> > -exit_edge = create_empty_if_region_on_edge (entry_edge,
> > -   unshare_expr (cond_expr));
> 
> This removes the fix for PR70045:
> ...
> diff --git a/gcc/graphite-isl-ast-to-gimple.c
> b/gcc/graphite-isl-ast-to-gimple.c
> index 89a4118..8dd5dc8 100644
> --- a/gcc/graphite-isl-ast-to-gimple.c
> +++ b/gcc/graphite-isl-ast-to-gimple.c
> @@ -821,7 +821,8 @@ graphite_create_new_loop_guard (edge entry_edge,
>if (integer_onep (cond_expr))
>  exit_edge = entry_edge;
>else
> -exit_edge = create_empty_if_region_on_edge (entry_edge, cond_expr);
> +exit_edge = create_empty_if_region_on_edge (entry_edge,
> + unshare_expr (cond_expr));
> 
>return exit_edge;
>  }
> ...
> 
> 
> Consequently, the pr70045.c testcase is currently ICE-ing.

Sorry.

> Attached patch fixes this.
> 
> OK for trunk if bootstrap and reg-test on x86_64 succeed?

Ok.

Richard.


Re: [PATCH][GRAPHITE] Fix PR71351

2017-12-19 Thread Tom de Vries

On 09/21/2017 12:07 PM, Richard Biener wrote:

-exit_edge = create_empty_if_region_on_edge (entry_edge,
-   unshare_expr (cond_expr));


This removes the fix for PR70045:
...
diff --git a/gcc/graphite-isl-ast-to-gimple.c 
b/gcc/graphite-isl-ast-to-gimple.c

index 89a4118..8dd5dc8 100644
--- a/gcc/graphite-isl-ast-to-gimple.c
+++ b/gcc/graphite-isl-ast-to-gimple.c
@@ -821,7 +821,8 @@ graphite_create_new_loop_guard (edge entry_edge,
   if (integer_onep (cond_expr))
 exit_edge = entry_edge;
   else
-exit_edge = create_empty_if_region_on_edge (entry_edge, cond_expr);
+exit_edge = create_empty_if_region_on_edge (entry_edge,
+   unshare_expr (cond_expr));

   return exit_edge;
 }
...


Consequently, the pr70045.c testcase is currently ICE-ing.

Attached patch fixes this.

OK for trunk if bootstrap and reg-test on x86_64 succeed?

Thanks,
- Tom
diff --git a/gcc/graphite-isl-ast-to-gimple.c b/gcc/graphite-isl-ast-to-gimple.c
index 848bfe9..b020b2d 100644
--- a/gcc/graphite-isl-ast-to-gimple.c
+++ b/gcc/graphite-isl-ast-to-gimple.c
@@ -739,10 +739,10 @@ translate_isl_ast_node_for (loop_p context_loop, __isl_keep isl_ast_node *node,
 	 as expected.  */
   tree ub_one = fold_build2 (POINTER_TYPE_P (type)
  ? POINTER_PLUS_EXPR : PLUS_EXPR,
- type, ub, one);
+ type, unshare_expr (ub), one);
   create_empty_if_region_on_edge (next_e,
   fold_build2 (LT_EXPR, boolean_type_node,
-		   lb, ub_one));
+		   unshare_expr (lb), ub_one));
   next_e = get_true_edge_from_guard_bb (next_e->dest);
 }
 


Re: isl scheduler and spatial locality (Re: [PATCH][GRAPHITE] More TLC)

2017-11-11 Thread Sven Verdoolaege
On Sun, Oct 01, 2017 at 11:58:30AM +0200, Sven Verdoolaege wrote:
> For the approach pluto is taking, you'll have to look at the source
> code, see pluto_intra_tile_optimize_band.
> For the other two approaches I mentioned above, reports will
> be made available within the next couple of weeks.

https://hal.inria.fr/hal-01628798
http://www.cs.kuleuven.be/publicaties/rapporten/cw/CW709.abs.html

skimo


[PATCH][GRAPHITE] Tame down dumping

2017-10-20 Thread Richard Biener

This tames dumping a bit and adjusts whitespacing and order of dumping.

Bootstrapped / tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-10-20  Richard Biener  

* graphite-isl-ast-to-gimple.c
(translate_isl_ast_to_gimple::graphite_copy_stmts_from_block):
Remove return value and simplify, dump copied stmt after lhs
adjustment.
(translate_isl_ast_to_gimple::translate_isl_ast_node_user):
Reduce dump verbosity.
(gsi_insert_earliest): Likewise.
(translate_isl_ast_to_gimple::copy_bb_and_scalar_dependences): Adjust.
* graphite.c (print_global_statistics): Adjust dumping.
(print_graphite_scop_statistics): Likewise.
(print_graphite_statistics): Do not dump loops here.
(graphite_transform_loops): But here.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253926)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -191,7 +191,7 @@ class translate_isl_ast_to_gimple
 
   tree get_rename_from_scev (tree old_name, gimple_seq *stmts, loop_p loop,
 vec iv_map);
-  bool graphite_copy_stmts_from_block (basic_block bb, basic_block new_bb,
+  void graphite_copy_stmts_from_block (basic_block bb, basic_block new_bb,
   vec iv_map);
   edge copy_bb_and_scalar_dependences (basic_block bb, edge next_e,
   vec iv_map);
@@ -791,13 +810,12 @@ translate_isl_ast_node_user (__isl_keep
   isl_ast_expr_free (user_expr);
 
   basic_block old_bb = GBB_BB (gbb);
-  if (dump_file)
+  if (dump_file && (dump_flags & TDF_DETAILS))
 {
   fprintf (dump_file,
   "[codegen] copying from bb_%d on edge (bb_%d, bb_%d)\n",
   old_bb->index, next_e->src->index, next_e->dest->index);
   print_loops_bb (dump_file, GBB_BB (gbb), 0, 3);
-
 }
 
   next_e = copy_bb_and_scalar_dependences (old_bb, next_e, iv_map);
@@ -807,7 +825,7 @@ translate_isl_ast_node_user (__isl_keep
   if (codegen_error_p ())
 return NULL;
 
-  if (dump_file)
+  if (dump_file && (dump_flags & TDF_DETAILS))
 {
   fprintf (dump_file, "[codegen] (after copy) new basic block\n");
   print_loops_bb (dump_file, next_e->src, 0, 3);
@@ -1049,9 +1067,9 @@ gsi_insert_earliest (gimple_seq seq)
 
   if (dump_file)
{
- fprintf (dump_file, "[codegen] inserting statement: ");
+ fprintf (dump_file, "[codegen] inserting statement in BB %d: ",
+  gimple_bb (use_stmt)->index);
  print_gimple_stmt (dump_file, use_stmt, 0, TDF_VOPS | TDF_MEMSYMS);
- print_loops_bb (dump_file, gimple_bb (use_stmt), 0, 3);
}
 }
 }
@@ -1122,7 +1140,7 @@ should_copy_to_new_region (gimple *stmt,
 /* Duplicates the statements of basic block BB into basic block NEW_BB
and compute the new induction variables according to the IV_MAP.  */
 
-bool translate_isl_ast_to_gimple::
+void translate_isl_ast_to_gimple::
 graphite_copy_stmts_from_block (basic_block bb, basic_block new_bb,
vec iv_map)
 {
@@ -1139,7 +1157,6 @@ graphite_copy_stmts_from_block (basic_bl
   /* Create a new copy of STMT and duplicate STMT's virtual
 operands.  */
   gimple *copy = gimple_copy (stmt);
-  gsi_insert_after (_tgt, copy, GSI_NEW_STMT);
 
   /* Rather than not copying debug stmts we reset them.
  ???  Where we can rewrite uses without inserting new
@@ -1154,12 +1171,6 @@ graphite_copy_stmts_from_block (basic_bl
gcc_unreachable ();
}
 
-  if (dump_file)
-   {
- fprintf (dump_file, "[codegen] inserting statement: ");
- print_gimple_stmt (dump_file, copy, 0);
-   }
-
   maybe_duplicate_eh_stmt (copy, stmt);
   gimple_duplicate_stmt_histograms (cfun, copy, cfun, stmt);
 
@@ -1172,8 +1183,12 @@ graphite_copy_stmts_from_block (basic_bl
  create_new_def_for (old_name, copy, def_p);
}
 
-  if (codegen_error_p ())
-   return false;
+  gsi_insert_after (_tgt, copy, GSI_NEW_STMT);
+  if (dump_file)
+   {
+ fprintf (dump_file, "[codegen] inserting statement: ");
+ print_gimple_stmt (dump_file, copy, 0);
+   }
 
   /* For each SCEV analyzable SSA_NAME, rename their usage.  */
   ssa_op_iter iter;
@@ -1198,8 +1213,6 @@ graphite_copy_stmts_from_block (basic_bl
 
   update_stmt (copy);
 }
-
-  return true;
 }
 
 
@@ -1236,11 +1249,7 @@ copy_bb_and_scalar_dependences (basic_bl
   gsi_insert_after (_tgt, ass, GSI_NEW_STMT);
 }
 
-  if (!graphite_copy_stmts_from_block (bb, new_bb, iv_map))
-{
-  set_codegen_error ();
-  return NULL;
-}
+  graphite_copy_stmts_from_block (bb, new_bb, iv_map);
 
   /* Insert out-of SSA copies on the original BB outgoing edges.  */
   gsi_tgt = gsi_last_bb (new_bb);

[PATCH][GRAPHITE] Limit AST code generation, PR82591

2017-10-18 Thread Richard Biener

The following limits ISL operations done during optimized AST generation
as the PR shows it can take quite a bit of time.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-10-18  Richard Biener  

PR tree-optimization/82591
* graphite.c (graphite_transform_loops): Move code gen message
printing ...
* graphite-isl-ast-to-gimple.c (graphite_regenerate_ast_isl):
Here.  Handle scop_to_isl_ast failing.
(scop_to_isl_ast): Limit the number of ISL operations.

Index: gcc/graphite.c
===
--- gcc/graphite.c  (revision 253848)
+++ gcc/graphite.c  (working copy)
@@ -378,16 +380,14 @@ graphite_transform_loops (void)
if (!apply_poly_transforms (scop))
  continue;
 
-   location_t loc = find_loop_location
- (scops[i]->scop_info->region.entry->dest->loop_father);
-
changed = true;
-   if (!graphite_regenerate_ast_isl (scop))
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
-  "loop nest not optimized, code generation error\n");
-   else
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
-  "loop nest optimized\n");
+   if (graphite_regenerate_ast_isl (scop))
+ {
+   location_t loc = find_loop_location
+ (scops[i]->scop_info->region.entry->dest->loop_father);
+   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
+"loop nest optimized\n");
+ }
   }
 
   if (changed)
Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253848)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -56,6 +56,7 @@ along with GCC; see the file COPYING3.
 #include "cfganal.h"
 #include "value-prof.h"
 #include "tree-ssa.h"
+#include "tree-vectorizer.h"
 #include "graphite.h"
 
 struct ast_build_info
@@ -1378,6 +1341,13 @@ ast_build_before_for (__isl_keep isl_ast
 __isl_give isl_ast_node *translate_isl_ast_to_gimple::
 scop_to_isl_ast (scop_p scop)
 {
+  int old_err = isl_options_get_on_error (scop->isl_context);
+  int old_max_operations = isl_ctx_get_max_operations (scop->isl_context);
+  int max_operations = PARAM_VALUE (PARAM_MAX_ISL_OPERATIONS);
+  if (max_operations)
+isl_ctx_set_max_operations (scop->isl_context, max_operations);
+  isl_options_set_on_error (scop->isl_context, ISL_ON_ERROR_CONTINUE);
+
   gcc_assert (scop->transformed_schedule);
 
   /* Set the separate option to reduce control flow overhead.  */
@@ -1396,6 +1366,27 @@ scop_to_isl_ast (scop_p scop)
   isl_ast_node *ast_isl = isl_ast_build_node_from_schedule
 (context_isl, schedule);
   isl_ast_build_free (context_isl);
+
+  isl_options_set_on_error (scop->isl_context, old_err);
+  isl_ctx_reset_operations (scop->isl_context);
+  isl_ctx_set_max_operations (scop->isl_context, old_max_operations);
+  if (isl_ctx_last_error (scop->isl_context) != isl_error_none)
+{
+  location_t loc = find_loop_location
+   (scop->scop_info->region.entry->dest->loop_father);
+  if (isl_ctx_last_error (scop->isl_context) == isl_error_quota)
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
+"loop nest not optimized, AST generation timed out "
+"after %d operations [--param max-isl-operations]\n",
+max_operations);
+  else
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
+"loop nest not optimized, ISL AST generation "
+"signalled an error\n");
+  isl_ast_node_free (ast_isl);
+  return NULL;
+}
+
   return ast_isl;
 }
 
@@ -1444,6 +1435,12 @@ graphite_regenerate_ast_isl (scop_p scop
   timevar_push (TV_GRAPHITE_CODE_GEN);
   t.add_parameters_to_ivs_params (scop, ip);
   root_node = t.scop_to_isl_ast (scop);
+  if (! root_node)
+{
+  ivs_params_clear (ip);
+  timevar_pop (TV_GRAPHITE_CODE_GEN);
+  return false;
+}
 
   if (dump_file && (dump_flags & TDF_DETAILS))
 {
@@ -1484,10 +1481,10 @@ graphite_regenerate_ast_isl (scop_p scop
 
   if (t.codegen_error_p ())
 {
-  if (dump_file)
-   fprintf (dump_file, "codegen error: "
-"reverting back to the original code.\n");
-  set_ifsese_condition (if_region, integer_zero_node);
+  location_t loc = find_loop_location
+   (scop->scop_info->region.entry->dest->loop_father);
+  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
+  "loop nest not optimized, code generation error\n");
 
   /* Remove the unreachable region.  */
   remove_edge_and_dominated_blocks (if_region->true_region->region.entry);


[PATCH][GRAPHITE] More TLC

2017-10-18 Thread Richard Biener

And using range-info to constrain parameters.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-10-18  Richard Biener  

* graphite-isl-ast-to-gimple.c
(translate_isl_ast_to_gimple::set_rename): Simplify.
(translate_isl_ast_to_gimple::set_rename_for_each_def): Inline...
(graphite_copy_stmts_from_block): ... here.
(copy_bb_and_scalar_dependences): Simplify.
(add_parameters_to_ivs_params): Canonicalize.
(generate_entry_out_of_ssa_copies): Simplify.
* graphite-sese-to-poly.c (extract_affine_name): Simplify
by passing in ISL dimension.
(parameter_index_in_region_1): Rename to ...
(parameter_index_in_region): ... this.
(extract_affine): Adjust assert, pass down parameter index.
(add_param_constraints): Use range-info when available.
(build_scop_context): Adjust.
* sese.c (new_sese_info): Adjust.
(free_sese_info): Likewise.
* sese.h (bb_map_t, rename_map_t, phi_rename, init_back_edge_pair_t):
Remove unused typedefs.
(struct sese_info_t): Simplify rename_map, remove incomplete_phis.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253848)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -195,7 +195,6 @@ class translate_isl_ast_to_gimple
   edge copy_bb_and_scalar_dependences (basic_block bb, edge next_e,
   vec iv_map);
   void set_rename (tree old_name, tree expr);
-  void set_rename_for_each_def (gimple *stmt);
   void gsi_insert_earliest (gimple_seq seq);
   bool codegen_error_p () const { return codegen_error; }
 
@@ -932,25 +931,12 @@ set_rename (tree old_name, tree expr)
 {
   fprintf (dump_file, "[codegen] setting rename: old_name = ");
   print_generic_expr (dump_file, old_name);
-  fprintf (dump_file, ", new_name = ");
+  fprintf (dump_file, ", new decl = ");
   print_generic_expr (dump_file, expr);
   fprintf (dump_file, "\n");
 }
-
-  if (old_name == expr)
-return;
-
-  vec  *renames = region->rename_map->get (old_name);
-
-  if (renames)
-renames->safe_push (expr);
-  else
-{
-  vec r;
-  r.create (2);
-  r.safe_push (expr);
-  region->rename_map->put (old_name, r);
-}
+  bool res = region->rename_map->put (old_name, expr);
+  gcc_assert (! res);
 }
 
 /* Return an iterator to the instructions comes last in the execution order.
@@ -1132,21 +1118,6 @@ should_copy_to_new_region (gimple *stmt,
   return true;
 }
 
-/* Create new names for all the definitions created by COPY and add replacement
-   mappings for each new name.  */
-
-void translate_isl_ast_to_gimple::
-set_rename_for_each_def (gimple *stmt)
-{
-  def_operand_p def_p;
-  ssa_op_iter op_iter;
-  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_ALL_DEFS)
-{
-  tree old_name = DEF_FROM_PTR (def_p);
-  create_new_def_for (old_name, stmt, def_p);
-}
-}
-
 /* Duplicates the statements of basic block BB into basic block NEW_BB
and compute the new induction variables according to the IV_MAP.  */
 
@@ -1192,7 +1163,13 @@ graphite_copy_stmts_from_block (basic_bl
   gimple_duplicate_stmt_histograms (cfun, copy, cfun, stmt);
 
   /* Crete new names for each def in the copied stmt.  */
-  set_rename_for_each_def (copy);
+  def_operand_p def_p;
+  ssa_op_iter op_iter;
+  FOR_EACH_SSA_DEF_OPERAND (def_p, copy, op_iter, SSA_OP_ALL_DEFS)
+   {
+ tree old_name = DEF_FROM_PTR (def_p);
+ create_new_def_for (old_name, copy, def_p);
+   }
 
   if (codegen_error_p ())
return false;
@@ -1244,17 +1221,14 @@ copy_bb_and_scalar_dependences (basic_bl
continue;
 
   tree new_phi_def;
-  vec  *renames = region->rename_map->get (res);
-  if (! renames || renames->is_empty ())
+  tree *rename = region->rename_map->get (res);
+  if (! rename)
{
  new_phi_def = create_tmp_reg (TREE_TYPE (res));
  set_rename (res, new_phi_def);
}
   else
-   {
- gcc_assert (renames->length () == 1);
- new_phi_def = (*renames)[0];
-   }
+   new_phi_def = *rename;
 
   gassign *ass = gimple_build_assign (NULL_TREE, new_phi_def);
   create_new_def_for (res, ass, NULL);
@@ -1291,17 +1265,14 @@ copy_bb_and_scalar_dependences (basic_bl
continue;
 
  tree new_phi_def;
- vec  *renames = region->rename_map->get (res);
- if (! renames || renames->is_empty ())
+ tree *rename = region->rename_map->get (res);
+ if (! rename)
{
  new_phi_def = create_tmp_reg (TREE_TYPE (res));
  set_rename (res, new_phi_def);
}
  else
-   {
- 

[PATCH][GRAPHITE] Fix ISL memory management issue

2017-10-17 Thread Richard Biener

The isl_union_map operations always take the existing map and return
a new one but scop_get_reads_and_writes tries to operate on its
parameters in-place.  This fails once a re-allocation happens leading
to "interesting" issues (like random segfaults with 
-fdump-tree-graphite-details on larger testcases).

Fixed as follows.

Committed as obvious.

Richard.

2017-10-17  Richard Biener  

* graphite-dependences.c (scop_get_reads_and_writes): Change
output parameters to references.

Index: gcc/graphite-dependences.c
===
--- gcc/graphite-dependences.c  (revision 253811)
+++ gcc/graphite-dependences.c  (working copy)
@@ -67,9 +67,9 @@ add_pdr_constraints (poly_dr_p pdr, poly
reads are returned in READS and writes in MUST_WRITES and MAY_WRITES.  */
 
 static void
-scop_get_reads_and_writes (scop_p scop, isl_union_map *reads,
-  isl_union_map *must_writes,
-  isl_union_map *may_writes)
+scop_get_reads_and_writes (scop_p scop, isl_union_map *&reads,
+  isl_union_map *&must_writes,
+  isl_union_map *&may_writes)
 {
   int i, j;
   poly_bb_p pbb;


[PATCH][GRAPHITE] Remove dead code

2017-10-17 Thread Richard Biener

The following removes copy_internal_parameters and the parameter rename
map.  It became dead when I forgot to copy the member to the
false if-region part ... and in a previous mail we discussed that we'd rather
wait for a testcase showing the need to handle "parameters" defined in
the region.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Richard.

2017-10-17  Richard Biener  

* graphite-isl-ast-to-gimple.c (gcc_expression_from_isl_ast_expr_id):
Simplify with removal of the parameter rename map.
(set_rename): Likewise.
(should_copy_to_new_region): Likewise.
(graphite_copy_stmts_from_block): Likewise.
(copy_bb_and_scalar_dependences): Remove initialization of
unused copied_bb_map.
(copy_def): Remove.
(copy_internal_parameters): Likewise.
(graphite_regenerate_ast_isl): Do not call copy_internal_parameters.
* graphite-scop-detection.c (scop_detection::stmt_simple_for_scop_p):
Use INTEGRAL_TYPE_P.
(parameter_index_in_region_1): Rename to ...
(assign_parameter_index_in_region): ... this.  Assert we have
a parameter we handle.
(scan_tree_for_params): Adjust.
* sese.h (parameter_rename_map_t): Remove.
(struct sese_info_t): Remove unused parameter_rename_map and
copied_bb_map members.
* sese.c (new_sese_info): Adjust.
(free_sese_info): Likewise.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253811)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -264,11 +264,9 @@ gcc_expression_from_isl_ast_expr_id (tre
  "Could not map isl_id to tree expression");
   isl_ast_expr_free (expr_id);
   tree t = res->second;
-  tree *val = region->parameter_rename_map->get(t);
-
-  if (!val)
-   val = 
-  return fold_convert (type, *val);
+  if (useless_type_conversion_p (type, TREE_TYPE (t)))
+return t;
+  return fold_convert (type, t);
 }
 
 /* Converts an isl_ast_expr_int expression E to a widest_int.
@@ -953,13 +951,6 @@ set_rename (tree old_name, tree expr)
   r.safe_push (expr);
   region->rename_map->put (old_name, r);
 }
-
-  tree t;
-  int i;
-  /* For a parameter of a scop we don't want to rename it.  */
-  FOR_EACH_VEC_ELT (region->params, i, t)
-if (old_name == t)
-  region->parameter_rename_map->put(old_name, expr);
 }
 
 /* Return an iterator to the instructions comes last in the execution order.
@@ -1138,14 +1129,6 @@ should_copy_to_new_region (gimple *stmt,
   && scev_analyzable_p (lhs, region->region))
 return false;
 
-  /* Do not copy parameters that have been generated in the header of the
- scop.  */
-  if (is_gimple_assign (stmt)
-  && (lhs = gimple_assign_lhs (stmt))
-  && TREE_CODE (lhs) == SSA_NAME
-  && region->parameter_rename_map->get(lhs))
-return false;
-
   return true;
 }
 
@@ -1214,7 +1197,7 @@ graphite_copy_stmts_from_block (basic_bl
   if (codegen_error_p ())
return false;
 
-  /* For each SSA_NAME in the parameter_rename_map rename their usage.  */
+  /* For each SCEV analyzable SSA_NAME, rename their usage.  */
   ssa_op_iter iter;
   use_operand_p use_p;
   if (!is_gimple_debug (copy))
@@ -1223,26 +1206,16 @@ graphite_copy_stmts_from_block (basic_bl
tree old_name = USE_FROM_PTR (use_p);
 
if (TREE_CODE (old_name) != SSA_NAME
-   || SSA_NAME_IS_DEFAULT_DEF (old_name))
- continue;
-
-   tree *new_expr = region->parameter_rename_map->get (old_name);
-   tree new_name;
-   if (!new_expr
-   && scev_analyzable_p (old_name, region->region))
- {
-   gimple_seq stmts = NULL;
-   new_name = get_rename_from_scev (old_name, ,
-bb->loop_father, iv_map);
-   if (! codegen_error_p ())
- gsi_insert_earliest (stmts);
-   new_expr = _name;
- }
-
-   if (!new_expr)
+   || SSA_NAME_IS_DEFAULT_DEF (old_name)
+   || ! scev_analyzable_p (old_name, region->region))
  continue;
 
-   replace_exp (use_p, *new_expr);
+   gimple_seq stmts = NULL;
+   tree new_name = get_rename_from_scev (old_name, ,
+ bb->loop_father, iv_map);
+   if (! codegen_error_p ())
+ gsi_insert_earliest (stmts);
+   replace_exp (use_p, new_name);
  }
 
   update_stmt (copy);
@@ -1288,17 +1261,6 @@ copy_bb_and_scalar_dependences (basic_bl
   gsi_insert_after (_tgt, ass, GSI_NEW_STMT);
 }
 
-  vec  *copied_bbs = region->copied_bb_map->get (bb);
-  if (copied_bbs)
-copied_bbs->safe_push (new_bb);
-  else
-{
-  vec bbs;
-  bbs.create (2);
-  

[PATCH][GRAPHITE] Fix PR82563

2017-10-17 Thread Richard Biener

PR82563 shows the ugly part of an earlier fix, that we now split the
entry edge of SCOPs during analysis phase to get a GBB for the entry
edge PHI copies.  That invalidates loop-closed SSA in some cases like
the PR.  So the following patch gets rid of that "fake" GBB by explicitly
emitting SESE entry edge copies (sources are all parameters).

This seems to remove quite some "spurious" optimized loop nests from SPEC.
I do see spurious schedule differences detected while the AST generator
still generates the same code, like for gcc.dg/graphite/interchange-1.c.
One of the cases "fixed" with this patch is

[scheduler] original schedule:
domain: "[P_2806, P_364] -> { S_294[] : -2147483648 <= P_2806 <= 
2147483647 and -9223372036854775808 <= P_364 <= 9223372036854775807; 
S_651[] : -2147483648 <= P_2806 <= 2147483647 and -9223372036854775808 <= 
P_364 <= 9223372036854775807; S_210[i33] : -2147483648 <= P_2806 <= 
2147483647 and -9223372036854775808 <= P_364 <= 9223372036854775807 and 0 
<= i33 <= 2147483645 and 4294967296*floor((-1 + P_2806)/4294967296) < 
P_2806 - i33; S_211[i34] : -2147483648 <= P_2806 <= 2147483647 and 
-9223372036854775808 <= P_364 <= 9223372036854775807 and 0 <= i34 <= 
2147483645 and 4294967296*floor((-1 + P_2806)/4294967296) < P_2806 - i34; 
S_687[] : -2147483648 <= P_2806 <= 2147483647 and -9223372036854775808 <= 
P_364 <= 9223372036854775807 }"
child:
  sequence:
  - filter: "[P_2806, P_364] -> { S_687[] }"
  - filter: "[P_2806, P_364] -> { S_210[i33] }"
child:
  schedule: "[P_2806, P_364] -> L_33[{ S_210[i33] -> [(i33)] }]"
  - filter: "[P_2806, P_364] -> { S_294[] }"
  - filter: "[P_2806, P_364] -> { S_651[] }"
  - filter: "[P_2806, P_364] -> { S_211[i34] }"
child:
  schedule: "[P_2806, P_364] -> L_34[{ S_211[i34] -> [(i34)] }]"

[scheduler] isl transformed schedule:
domain: "[P_2806, P_364] -> { S_294[] : -2147483648 <= P_2806 <= 
2147483647 and -9223372036854775808 <= P_364 <= 9223372036854775807; 
S_651[] : -2147483648 <= P_2806 <= 2147483647 and -9223372036854775808 <= 
P_364 <= 9223372036854775807; S_210[i33] : -2147483648 <= P_2806 <= 
2147483647 and -9223372036854775808 <= P_364 <= 9223372036854775807 and 0 
<= i33 <= 2147483645 and 4294967296*floor((-1 + P_2806)/4294967296) < 
P_2806 - i33; S_211[i34] : -2147483648 <= P_2806 <= 2147483647 and 
-9223372036854775808 <= P_364 <= 9223372036854775807 and 0 <= i34 <= 
2147483645 and 4294967296*floor((-1 + P_2806)/4294967296) < P_2806 - i34; 
S_687[] : -2147483648 <= P_2806 <= 2147483647 and -9223372036854775808 <= 
P_364 <= 9223372036854775807 }"
child:
  sequence:
  - filter: "[P_2806, P_364] -> { S_210[i33]; S_687[] }"
child:
  schedule: "[P_2806, P_364] -> [{ S_210[i33] -> [(i33)]; S_687[] -> 
[(0)] }]"
  permutable: 1
  child:
sequence:
- filter: "[P_2806, P_364] -> { S_687[] }"
- filter: "[P_2806, P_364] -> { S_210[i33] }"
  - filter: "[P_2806, P_364] -> { S_294[] }"
  - filter: "[P_2806, P_364] -> { S_651[] }"
  - filter: "[P_2806, P_364] -> { S_211[i34] }"
child:
  schedule: "[P_2806, P_364] -> [{ S_211[i34] -> [(i34)] }]"
  permutable: 1
  coincident: [ 1 ]

[scheduler] original ast:
{
  S_687();
  for (int c0 = 0; c0 < P_2806; c0 += 1)
S_210(c0);
  S_294();
  S_651();
  for (int c0 = 0; c0 < P_2806; c0 += 1)
S_211(c0);
}

[scheduler] AST generated by isl:
{
  S_687();
  for (int c0 = 0; c0 < P_2806; c0 += 1)
S_210(c0);
  S_294();
  S_651();
  for (int c0 = 0; c0 < P_2806; c0 += 1)
S_211(c0);
}

where with the patch S_687 () is no longer there (and S_210 depending
on it via RAW).

Overall the number of "optimized" loop nests in SPEC CPU 2006 drops
from 348 to 279.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2017-10-17  Richard Biener  

PR tree-optimization/82563
* graphite-isl-ast-to-gimple.c (generate_entry_out_of_ssa_copies):
New function.
(graphite_regenerate_ast_isl): Call it.
* graphite-scop-detection.c (build_scops): Remove entry edge split.

* gcc.dg/graphite/pr82563.c: New testcase.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253807)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -1501,6 +1501,35 @@ copy_internal_parameters (sese_info_p re
 }
 }
 
+/* Generate out-of-SSA copies for the entry edge FALSE_ENTRY/TRUE_ENTRY
+   in REGION.  */
+
+static void
+generate_entry_out_of_ssa_copies (edge false_entry,
+ edge true_entry,
+ sese_info_p region)
+{
+  gimple_stmt_iterator gsi_tgt = gsi_start_bb (true_entry->dest);
+  for (gphi_iterator psi = gsi_start_phis (false_entry->dest);
+   !gsi_end_p (psi); gsi_next ())
+{
+  gphi *phi = psi.phi ();
+  tree res = gimple_phi_result (phi);
+  if (virtual_operand_p (res))
+   

Re: [PATCH][GRAPHITE] Consistently use region analysis

2017-10-17 Thread Richard Biener
On Sat, 14 Oct 2017, Sebastian Pop wrote:

> On Fri, Oct 13, 2017 at 8:02 AM, Richard Biener  wrote:
> 
> >
> > Now that SCEV instantiation handles regions properly (see hunk below
> > for a minor fix) we can use it consistently from GRAPHITE and thus
> > simplify scalar_evolution_in_region greatly.
> >
> > Bootstrap and regtest running on x86_64-unknown-linux-gnu.
> >
> > A lot of the parameter renaming stuff looks dead but a more "complete"
> > patch causes some more SPEC miscompares and also a bootstrap issue
> > (warning only, but an uninitialized use of 'int tem = 0;' ...).
> >
> > This is probably all latent issues coming up more easily now.
> >
> > Note that formerly we'd support invariant "parameters" defined in
> > the region by copying those out but now SCEV instantiation should
> > lead to chrec_dont_know for stuff we cannot gobble up (anything not
> > affine).  This probably only worked for the outermost scop in the
> > region and it means we need some other way to handle those.
> 
> 
> How important is it to move defs out of the region?

I have no idea...

> Can we postpone handling those cases until we have an interesting case?

That would be my preference as well.

> The
> > original issue is probably that "parameters" cannot occur in
> > dependences and thus an array index cannot "depend" on the computation
> > of a parameter (and array indexes coming from "data" cannot be handled
> > anyway?).
> 
> 
> Correct.  Parameters can occur in array indexes as long as they cancel out.
> For example, the following dependence can be computed:
> 
> A[p] vs. A[p+3]
> 
> and the following dependence cannot be computed
> 
> A[p] vs. A[0]
> 
> as the value of the parameter p is not known at compilation time.
> 
> We don't seem to have any functional testcase for those
> > parameters that are not parameters.
> >
> >
> Ok.  Let's wait for a testcase that needs this functionality.

Good.  I'll install this and hope to get some spare cycles to look
at the latent wrong-code issues that have popped up on SPEC 2k6.

Richard.

> 
> > Richard.
> >
> > 2017-10-13  Richard Biener  
> >
> > * graphite-scop-detection.c
> > (scop_detection::stmt_has_simple_data_refs_p): Always use
> > the full nest as region.
> > (try_generate_gimple_bb): Likewise.
> > (build_scops): First split edges, then compute RPO order.
> > * sese.c (scalar_evolution_in_region): Simplify now that
> > SCEV can handle instantiation in regions.
> > * tree-scalar-evolution.c (instantiate_scev_name): Also instantiate
> > in the non-loop part of a function if requested.
> >
> 
> Looks good.
> Thanks.
> 
> 
> >
> > Index: gcc/graphite-scop-detection.c
> > ===
> > --- gcc/graphite-scop-detection.c   (revision 253721)
> > +++ gcc/graphite-scop-detection.c   (working copy)
> > @@ -1005,15 +1005,10 @@ scop_detection::graphite_can_represent_e
> >  bool
> >  scop_detection::stmt_has_simple_data_refs_p (sese_l scop, gimple *stmt)
> >  {
> > -  edge nest;
> > +  edge nest = scop.entry;;
> >loop_p loop = loop_containing_stmt (stmt);
> >if (!loop_in_sese_p (loop, scop))
> > -{
> > -  nest = scop.entry;
> > -  loop = NULL;
> > -}
> > -  else
> > -nest = loop_preheader_edge (outermost_loop_in_sese (scop, gimple_bb
> > (stmt)));
> > +loop = NULL;
> >
> >auto_vec drs;
> >if (! graphite_find_data_references_in_stmt (nest, loop, stmt, ))
> > @@ -1381,15 +1350,10 @@ try_generate_gimple_bb (scop_p scop, bas
> >vec reads = vNULL;
> >
> >sese_l region = scop->scop_info->region;
> > -  edge nest;
> > +  edge nest = region.entry;
> >loop_p loop = bb->loop_father;
> >if (!loop_in_sese_p (loop, region))
> > -{
> > -  nest = region.entry;
> > -  loop = NULL;
> > -}
> > -  else
> > -nest = loop_preheader_edge (outermost_loop_in_sese (region, bb));
> > +loop = NULL;
> >
> >for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
> > gsi_next ())
> > @@ -1696,6 +1660,13 @@ build_scops (vec *scops)
> >/* Now create scops from the lightweight SESEs.  */
> >vec scops_l = sb.get_scops ();
> >
> > +  /* For our out-of-SSA we need a block on s->entry, similar to how
> > + we include the LCSSA block in the region.  */
> > +  int i;
> > +  sese_l *s;
> > +  FOR_EACH_VEC_ELT (scops_l, i, s)
> > +s->entry = single_pred_edge (split_edge (s->entry));
> > +
> >/* Domwalk needs a bb to RPO mapping.  Compute it once here.  */
> >int *postorder = XNEWVEC (int, n_basic_blocks_for_fn (cfun));
> >int postorder_num = pre_and_rev_post_order_compute (NULL, postorder,
> > true);
> > @@ -1704,14 +1675,8 @@ build_scops (vec *scops)
> >  bb_to_rpo[postorder[i]] = i;
> >free (postorder);
> >
> > -  int i;
> > -  sese_l *s;
> >FOR_EACH_VEC_ELT (scops_l, i, s)
> >  {
> > -  /* For our 

Re: [PATCH][GRAPHITE] Consistently use region analysis

2017-10-14 Thread Sebastian Pop
On Fri, Oct 13, 2017 at 8:02 AM, Richard Biener  wrote:

>
> Now that SCEV instantiation handles regions properly (see hunk below
> for a minor fix) we can use it consistently from GRAPHITE and thus
> simplify scalar_evolution_in_region greatly.
>
> Bootstrap and regtest running on x86_64-unknown-linux-gnu.
>
> A lot of the parameter renaming stuff looks dead but a more "complete"
> patch causes some more SPEC miscompares and also a bootstrap issue
> (warning only, but an uninitialized use of 'int tem = 0;' ...).
>
> This is probably all latent issues coming up more easily now.
>
> Note that formerly we'd support invariant "parameters" defined in
> the region by copying those out but now SCEV instantiation should
> lead to chrec_dont_know for stuff we cannot gobble up (anything not
> affine).  This probably only worked for the outermost scop in the
> region and it means we need some other way to handle those.


How important is it to move defs out of the region?
Can we postpone handling those cases until we have an interesting case?

The
> original issue is probably that "parameters" cannot occur in
> dependences and thus an array index cannot "depend" on the computation
> of a parameter (and array indexes coming from "data" cannot be handled
> anyway?).


Correct.  Parameters can occur in array indexes as long as they cancel out.
For example, the following dependence can be computed:

A[p] vs. A[p+3]

and the following dependence cannot be computed

A[p] vs. A[0]

as the value of the parameter p is not known at compilation time.

We don't seem to have any functional testcase for those
> parameters that are not parameters.
>
>
Ok.  Let's wait for a testcase that needs this functionality.


> Richard.
>
> 2017-10-13  Richard Biener  
>
> * graphite-scop-detection.c
> (scop_detection::stmt_has_simple_data_refs_p): Always use
> the full nest as region.
> (try_generate_gimple_bb): Likewise.
> (build_scops): First split edges, then compute RPO order.
> * sese.c (scalar_evolution_in_region): Simplify now that
> SCEV can handle instantiation in regions.
> * tree-scalar-evolution.c (instantiate_scev_name): Also instantiate
> in the non-loop part of a function if requested.
>

Looks good.
Thanks.


>
> Index: gcc/graphite-scop-detection.c
> ===
> --- gcc/graphite-scop-detection.c   (revision 253721)
> +++ gcc/graphite-scop-detection.c   (working copy)
> @@ -1005,15 +1005,10 @@ scop_detection::graphite_can_represent_e
>  bool
>  scop_detection::stmt_has_simple_data_refs_p (sese_l scop, gimple *stmt)
>  {
> -  edge nest;
> +  edge nest = scop.entry;;
>loop_p loop = loop_containing_stmt (stmt);
>if (!loop_in_sese_p (loop, scop))
> -{
> -  nest = scop.entry;
> -  loop = NULL;
> -}
> -  else
> -nest = loop_preheader_edge (outermost_loop_in_sese (scop, gimple_bb
> (stmt)));
> +loop = NULL;
>
>auto_vec drs;
>if (! graphite_find_data_references_in_stmt (nest, loop, stmt, ))
> @@ -1381,15 +1350,10 @@ try_generate_gimple_bb (scop_p scop, bas
>vec reads = vNULL;
>
>sese_l region = scop->scop_info->region;
> -  edge nest;
> +  edge nest = region.entry;
>loop_p loop = bb->loop_father;
>if (!loop_in_sese_p (loop, region))
> -{
> -  nest = region.entry;
> -  loop = NULL;
> -}
> -  else
> -nest = loop_preheader_edge (outermost_loop_in_sese (region, bb));
> +loop = NULL;
>
>for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
> gsi_next ())
> @@ -1696,6 +1660,13 @@ build_scops (vec *scops)
>/* Now create scops from the lightweight SESEs.  */
>vec scops_l = sb.get_scops ();
>
> +  /* For our out-of-SSA we need a block on s->entry, similar to how
> + we include the LCSSA block in the region.  */
> +  int i;
> +  sese_l *s;
> +  FOR_EACH_VEC_ELT (scops_l, i, s)
> +s->entry = single_pred_edge (split_edge (s->entry));
> +
>/* Domwalk needs a bb to RPO mapping.  Compute it once here.  */
>int *postorder = XNEWVEC (int, n_basic_blocks_for_fn (cfun));
>int postorder_num = pre_and_rev_post_order_compute (NULL, postorder,
> true);
> @@ -1704,14 +1675,8 @@ build_scops (vec *scops)
>  bb_to_rpo[postorder[i]] = i;
>free (postorder);
>
> -  int i;
> -  sese_l *s;
>FOR_EACH_VEC_ELT (scops_l, i, s)
>  {
> -  /* For our out-of-SSA we need a block on s->entry, similar to how
> - we include the LCSSA block in the region.  */
> -  s->entry = single_pred_edge (split_edge (s->entry));
> -
>scop_p scop = new_scop (s->entry, s->exit);
>
>/* Record all basic blocks and their conditions in REGION.  */
> Index: gcc/sese.c
> ===
> --- gcc/sese.c  (revision 253721)
> +++ gcc/sese.c  (working copy)
> @@ -459,41 +447,16 @@ scev_analyzable_p (tree 

[PATCH][GRAPHITE] Consistently use region analysis

2017-10-13 Thread Richard Biener

Now that SCEV instantiation handles regions properly (see hunk below
for a minor fix) we can use it consistently from GRAPHITE and thus
simplify scalar_evolution_in_region greatly.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

A lot of the parameter renaming stuff looks dead but a more "complete"
patch causes some more SPEC miscompares and also a bootstrap issue
(warning only, but an uninitialized use of 'int tem = 0;' ...).

This is probably all latent issues coming up more easily now.

Note that formerly we'd support invariant "parameters" defined in
the region by copying those out but now SCEV instantiation should
lead to chrec_dont_know for stuff we cannot gobble up (anything not
affine).  This probably only worked for the outermost scop in the
region and it means we need some other way to handle those.  The
original issue is probably that "parameters" cannot occur in
dependences and thus an array index cannot "depend" on the computation
of a parameter (and array indexes coming from "data" cannot be handled
anyway?).  We don't seem to have any functional testcase for those
parameters that are not parameters.

Richard.

2017-10-13  Richard Biener  

* graphite-scop-detection.c
(scop_detection::stmt_has_simple_data_refs_p): Always use
the full nest as region.
(try_generate_gimple_bb): Likewise.
(build_scops): First split edges, then compute RPO order.
* sese.c (scalar_evolution_in_region): Simplify now that
SCEV can handle instantiation in regions.
* tree-scalar-evolution.c (instantiate_scev_name): Also instantiate
in the non-loop part of a function if requested.

Index: gcc/graphite-scop-detection.c
===
--- gcc/graphite-scop-detection.c   (revision 253721)
+++ gcc/graphite-scop-detection.c   (working copy)
@@ -1005,15 +1005,10 @@ scop_detection::graphite_can_represent_e
 bool
 scop_detection::stmt_has_simple_data_refs_p (sese_l scop, gimple *stmt)
 {
-  edge nest;
+  edge nest = scop.entry;;
   loop_p loop = loop_containing_stmt (stmt);
   if (!loop_in_sese_p (loop, scop))
-{
-  nest = scop.entry;
-  loop = NULL;
-}
-  else
-nest = loop_preheader_edge (outermost_loop_in_sese (scop, gimple_bb 
(stmt)));
+loop = NULL;
 
   auto_vec drs;
   if (! graphite_find_data_references_in_stmt (nest, loop, stmt, ))
@@ -1381,15 +1350,10 @@ try_generate_gimple_bb (scop_p scop, bas
   vec reads = vNULL;
 
   sese_l region = scop->scop_info->region;
-  edge nest;
+  edge nest = region.entry;
   loop_p loop = bb->loop_father;
   if (!loop_in_sese_p (loop, region))
-{
-  nest = region.entry;
-  loop = NULL;
-}
-  else
-nest = loop_preheader_edge (outermost_loop_in_sese (region, bb));
+loop = NULL;
 
   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
gsi_next ())
@@ -1696,6 +1660,13 @@ build_scops (vec *scops)
   /* Now create scops from the lightweight SESEs.  */
   vec scops_l = sb.get_scops ();
 
+  /* For our out-of-SSA we need a block on s->entry, similar to how
+ we include the LCSSA block in the region.  */
+  int i;
+  sese_l *s;
+  FOR_EACH_VEC_ELT (scops_l, i, s)
+s->entry = single_pred_edge (split_edge (s->entry));
+
   /* Domwalk needs a bb to RPO mapping.  Compute it once here.  */
   int *postorder = XNEWVEC (int, n_basic_blocks_for_fn (cfun));
   int postorder_num = pre_and_rev_post_order_compute (NULL, postorder, true);
@@ -1704,14 +1675,8 @@ build_scops (vec *scops)
 bb_to_rpo[postorder[i]] = i;
   free (postorder);
 
-  int i;
-  sese_l *s;
   FOR_EACH_VEC_ELT (scops_l, i, s)
 {
-  /* For our out-of-SSA we need a block on s->entry, similar to how
- we include the LCSSA block in the region.  */
-  s->entry = single_pred_edge (split_edge (s->entry));
-
   scop_p scop = new_scop (s->entry, s->exit);
 
   /* Record all basic blocks and their conditions in REGION.  */
Index: gcc/sese.c
===
--- gcc/sese.c  (revision 253721)
+++ gcc/sese.c  (working copy)
@@ -459,41 +447,16 @@ scev_analyzable_p (tree def, sese_l 
 tree
 scalar_evolution_in_region (const sese_l , loop_p loop, tree t)
 {
-  gimple *def;
-  struct loop *def_loop;
-
   /* SCOP parameters.  */
   if (TREE_CODE (t) == SSA_NAME
   && !defined_in_sese_p (t, region))
 return t;
 
-  if (TREE_CODE (t) != SSA_NAME
-  || loop_in_sese_p (loop, region))
-/* FIXME: we would need instantiate SCEV to work on a region, and be more
-   flexible wrt. memory loads that may be invariant in the region.  */
-return instantiate_scev (region.entry, loop,
-analyze_scalar_evolution (loop, t));
-
-  def = SSA_NAME_DEF_STMT (t);
-  def_loop = loop_containing_stmt (def);
-
-  if (loop_in_sese_p (def_loop, region))
-{
-  t = analyze_scalar_evolution (def_loop, t);
-   

[PATCH][GRAPHITE] Fix SSA update

2017-10-13 Thread Richard Biener

This is something I wanted to do later just as a compile-time optimization
but it turns out it is necessary for correctness if we want to keep
the current order of creating SCOPs and analyzing data references and
parameters and only after that code-generating SCOPs that were optimized.

This is because which SSA names are the parameters really depends on the
IL, and liveout PHIs for transformed SESE regions change the situation
enough that we would use "stale" information.

Of course the issue isn't one if we do the transform in "one step"
because update-SSA can just cope with that.

In the process I simplified the main graphite function to inline
the initialize/finalize stuff, removing a weird parameter that
ended up PASSing gcc.dg/graphite/pr35356-3.c (with just one loop)
so I XFAILed that again.

I've added gcc.dg/graphite/pr81373-2.c which ICEs before this patch.

Bootstrapped and tested on x86_64-unknown-linux-gnu, SPEC is happy,
applied.

Richard.

2017-10-13  Richard Biener  

* graphite-isl-ast-to-gimple.c
(translate_isl_ast_to_gimple::get_rename_from_scev): Remove unused
parameters and dominance check.
(translate_isl_ast_to_gimple::graphite_copy_stmts_from_block): Adjust.
(translate_isl_ast_to_gimple::copy_bb_and_scalar_dependences): Likewise.
(translate_isl_ast_to_gimple::graphite_regenerate_ast_isl):
Do not update SSA form here or do intermediate IL verification.
* graphite.c: Include tree-ssa.h and tree-into-ssa.h.
(graphite_initialize): Remove check on the number of loops in
the function and inline into graphite_transform_loops.
(graphite_finalize): Inline into graphite_transform_loops.
(graphite_transform_loops): Perform SSA update and IL verification
here.
* params.def (PARAM_GRAPHITE_MIN_LOOPS_PER_FUNCTION): Remove.

* gcc.dg/graphite/pr35356-3.c: XFAIL again.
* gcc.dg/graphite/pr81373-2.c: Copy from gcc.dg/graphite/pr81373.c
with alternate flags.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253719)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -189,7 +189,6 @@ class translate_isl_ast_to_gimple
   __isl_give isl_ast_node * scop_to_isl_ast (scop_p scop);
 
   tree get_rename_from_scev (tree old_name, gimple_seq *stmts, loop_p loop,
-basic_block new_bb, basic_block old_bb,
 vec iv_map);
   bool graphite_copy_stmts_from_block (basic_block bb, basic_block new_bb,
   vec iv_map);
@@ -1084,7 +1083,6 @@ gsi_insert_earliest (gimple_seq seq)
 
 tree translate_isl_ast_to_gimple::
 get_rename_from_scev (tree old_name, gimple_seq *stmts, loop_p loop,
- basic_block new_bb, basic_block,
  vec iv_map)
 {
   tree scev = scalar_evolution_in_region (region->region, loop, old_name);
@@ -1113,16 +,6 @@ get_rename_from_scev (tree old_name, gim
   return build_zero_cst (TREE_TYPE (old_name));
 }
 
-  if (TREE_CODE (new_expr) == SSA_NAME)
-{
-  basic_block bb = gimple_bb (SSA_NAME_DEF_STMT (new_expr));
-  if (bb && !dominated_by_p (CDI_DOMINATORS, new_bb, bb))
-   {
- set_codegen_error ();
- return build_zero_cst (TREE_TYPE (old_name));
-   }
-}
-
   /* Replace the old_name with the new_expr.  */
   return force_gimple_operand (unshare_expr (new_expr), stmts,
   true, NULL_TREE);
@@ -1245,8 +1233,7 @@ graphite_copy_stmts_from_block (basic_bl
  {
gimple_seq stmts = NULL;
new_name = get_rename_from_scev (old_name, ,
-bb->loop_father,
-new_bb, bb, iv_map);
+bb->loop_father, iv_map);
if (! codegen_error_p ())
  gsi_insert_earliest (stmts);
new_expr = _name;
@@ -1361,7 +1348,7 @@ copy_bb_and_scalar_dependences (basic_bl
  gimple_seq stmts = NULL;
  tree new_name = get_rename_from_scev (arg, ,
bb->loop_father,
-   new_bb, bb, iv_map);
+   iv_map);
  if (! codegen_error_p ())
gsi_insert_earliest (stmts);
  arg = new_name;
@@ -1567,17 +1554,6 @@ graphite_regenerate_ast_isl (scop_p scop
 if_region->true_region->region.exit);
   if (dump_file)
fprintf (dump_file, "[codegen] isl AST to Gimple succeeded.\n");
-
-  mark_virtual_operands_for_renaming (cfun);
-  update_ssa (TODO_update_ssa);
-  checking_verify_ssa (true, 

[PATCH][GRAPHITE] Some TLC

2017-10-13 Thread Richard Biener

Removing a global constructor, a return value that isn't checked
and adjusting testcases that spew -Waggressive-loop-optimizations
warnings when built with different options.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-10-13  Richard Biener  

* graphite-isl-ast-to-gimple.c (max_mode_int_precision,
graphite_expression_type_precision): Avoid global constructor
by moving ...
(translate_isl_ast_to_gimple::translate_isl_ast_to_gimple): Here.
(translate_isl_ast_to_gimple::graphite_expr_type): Add type
member.
(translate_isl_ast_to_gimple::translate_isl_ast_node_for): Use it.
(translate_isl_ast_to_gimple::build_iv_mapping): Likewise.
(translate_isl_ast_to_gimple::graphite_create_new_guard): Likewise.
* graphite-sese-to-poly.c (build_original_schedule): Return nothing.

* gcc.dg/graphite/scop-10.c: Enlarge array to avoid undefined
behavior.
* gcc.dg/graphite/scop-7.c: Likewise.
* gcc.dg/graphite/scop-8.c: Likewise.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253707)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -58,15 +58,6 @@ along with GCC; see the file COPYING3.
 #include "tree-ssa.h"
 #include "graphite.h"
 
-/* We always try to use signed 128 bit types, but fall back to smaller types
-   in case a platform does not provide types of these sizes. In the future we
-   should use isl to derive the optimal type for each subexpression.  */
-
-static int max_mode_int_precision =
-  GET_MODE_PRECISION (int_mode_for_size (MAX_FIXED_MODE_SIZE, 0).require ());
-static int graphite_expression_type_precision = 128 <= max_mode_int_precision ?
-   128 : max_mode_int_precision;
-
 struct ast_build_info
 {
   ast_build_info()
@@ -143,8 +134,7 @@ enum phi_node_kind
 class translate_isl_ast_to_gimple
 {
  public:
-  translate_isl_ast_to_gimple (sese_info_p r)
-: region (r), codegen_error (false) { }
+  translate_isl_ast_to_gimple (sese_info_p r);
   edge translate_isl_ast (loop_p context_loop, __isl_keep isl_ast_node *node,
  edge next_e, ivs_params );
   edge translate_isl_ast_node_for (loop_p context_loop,
@@ -235,8 +225,24 @@ private:
 
   /* A vector of all the edges at if_condition merge points.  */
   auto_vec merge_points;
+
+  tree graphite_expr_type;
 };
 
+translate_isl_ast_to_gimple::translate_isl_ast_to_gimple (sese_info_p r)
+  : region (r), codegen_error (false)
+{
+  /* We always try to use signed 128 bit types, but fall back to smaller types
+ in case a platform does not provide types of these sizes. In the future we
+ should use isl to derive the optimal type for each subexpression.  */
+  int max_mode_int_precision
+= GET_MODE_PRECISION (int_mode_for_size (MAX_FIXED_MODE_SIZE, 0).require 
());
+  int graphite_expr_type_precision
+= 128 <= max_mode_int_precision ?  128 : max_mode_int_precision;
+  graphite_expr_type
+= build_nonstandard_integer_type (graphite_expr_type_precision, 0);
+}
+
 /* Return the tree variable that corresponds to the given isl ast identifier
expression (an isl_ast_expr of type isl_ast_expr_id).
 
@@ -702,8 +708,7 @@ translate_isl_ast_node_for (loop_p conte
edge next_e, ivs_params )
 {
   gcc_assert (isl_ast_node_get_type (node) == isl_ast_node_for);
-  tree type
-= build_nonstandard_integer_type (graphite_expression_type_precision, 0);
+  tree type = graphite_expr_type;
 
   isl_ast_expr *for_init = isl_ast_node_for_get_init (node);
   tree lb = gcc_expression_from_isl_expression (type, for_init, ip);
@@ -742,8 +747,7 @@ build_iv_mapping (vec iv_map, gimp
   for (i = 1; i < isl_ast_expr_get_op_n_arg (user_expr); i++)
 {
   arg_expr = isl_ast_expr_get_op_arg (user_expr, i);
-  tree type =
-   build_nonstandard_integer_type (graphite_expression_type_precision, 0);
+  tree type = graphite_expr_type;
   tree t = gcc_expression_from_isl_expression (type, arg_expr, ip);
 
   /* To fail code generation, we generate wrong code until we discard it.  
*/
@@ -841,8 +845,7 @@ edge translate_isl_ast_to_gimple::
 graphite_create_new_guard (edge entry_edge, __isl_take isl_ast_expr *if_cond,
   ivs_params )
 {
-  tree type =
-build_nonstandard_integer_type (graphite_expression_type_precision, 0);
+  tree type = graphite_expr_type;
   tree cond_expr = gcc_expression_from_isl_expression (type, if_cond, ip);
 
   /* To fail code generation, we generate wrong code until we discard it.  */
Index: gcc/graphite-sese-to-poly.c
===
--- gcc/graphite-sese-to-poly.c (revision 253707)
+++ gcc/graphite-sese-to-poly.c (working copy)
@@ -1194,7 +1194,7 @@ 

Re: [PATCH][GRAPHITE] Fix PR82525

2017-10-12 Thread Sebastian Pop
On Oct 12, 2017 4:36 AM, "Richard Biener"  wrote:


The following avoids code-generation errors for modulo operations
resulting from our own constraints ending up as no-ops because
the type we code-generate in already imposes the modulo operation.

For the case in SPEC 2k6 triggering this we'd even know the
modulo constraint isn't necessary - we have

 int64_t lower, upper;
 if (lower < upper)
   uint64_t niter = (uint64_t)upper - (uint64_t)lower;

but there's no way to represent in GIMPLE that subtracting
the two signed values will yield a positive value fitting
in the corresponding unsigned type...  We'd need something
like a MINUSU_EXPR (like the often proposed ABSU_EXPR or
the proposed POINTER_DIFF_EXPR).

This fixes the last code generation errors with SPEC CPU 2006
and -fgraphite-identity -floop-nest-optimize.

loop nest optimized: 483
loop nest not optimized, code generation error: 0
loop nest not optimized, optimized schedule is identical to original
schedule: 173
loop nest not optimized, optimization timed out: 60
loop nest not optimized, ISL signalled an error: 9
loop nest: 725

Note that we still (with and without this patch) get miscompares
in 465.tonto, 416.gamess and 403.gcc (we have that "wrong"
constraints thing leading to empty domains, if you remember).

Bootstrap and regtest running on x86_64-unknown-linux-gnu, ok?

Thanks,
Richard.

2017-10-12  Richard Biener  

PR tree-optimization/82525
* graphite-isl-ast-to-gimple.c
(translate_isl_ast_to_gimple::widest_int_from_isl_expr_int): Split
out from ...
(translate_isl_ast_to_gimple::gcc_expression_from_isl_expr_int):
Here.
Fail code generation when we cannot represent the isl integer.
(binary_op_to_tree): Elide modulo operations that are no-ops
in the type we code generate.  Remove now superfluous code
generation errors.

* gcc.dg/graphite/id-30.c: New testcase.
* gfortran.dg/graphite/id-28.f90: Likewise.


Looks good.


Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253645)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -177,6 +177,7 @@ class translate_isl_ast_to_gimple
   tree gcc_expression_from_isl_ast_expr_id (tree type,
__isl_keep isl_ast_expr
*expr_id,
ivs_params );
+  widest_int widest_int_from_isl_expr_int (__isl_keep isl_ast_expr *expr);
   tree gcc_expression_from_isl_expr_int (tree type,
 __isl_take isl_ast_expr *expr);
   tree gcc_expression_from_isl_expr_op (tree type,
@@ -265,29 +266,46 @@ gcc_expression_from_isl_ast_expr_id (tre
   return fold_convert (type, *val);
 }

-/* Converts an isl_ast_expr_int expression E to a GCC expression tree of
-   type TYPE.  */
+/* Converts an isl_ast_expr_int expression E to a widest_int.
+   Raises a code generation error when the constant doesn't fit.  */

-tree translate_isl_ast_to_gimple::
-gcc_expression_from_isl_expr_int (tree type, __isl_take isl_ast_expr *expr)
+widest_int translate_isl_ast_to_gimple::
+widest_int_from_isl_expr_int (__isl_keep isl_ast_expr *expr)
 {
   gcc_assert (isl_ast_expr_get_type (expr) == isl_ast_expr_int);
   isl_val *val = isl_ast_expr_get_val (expr);
   size_t n = isl_val_n_abs_num_chunks (val, sizeof (HOST_WIDE_INT));
   HOST_WIDE_INT *chunks = XALLOCAVEC (HOST_WIDE_INT, n);
-  tree res;
-  if (isl_val_get_abs_num_chunks (val, sizeof (HOST_WIDE_INT), chunks) ==
-1)
-res = NULL_TREE;
-  else
+  if (n > WIDE_INT_MAX_ELTS
+  || isl_val_get_abs_num_chunks (val, sizeof (HOST_WIDE_INT), chunks)
== -1)
 {
-  widest_int wi = widest_int::from_array (chunks, n, true);
-  if (isl_val_is_neg (val))
-   wi = -wi;
-  res = wide_int_to_tree (type, wi);
+  isl_val_free (val);
+  set_codegen_error ();
+  return 0;
 }
+  widest_int wi = widest_int::from_array (chunks, n, true);
+  if (isl_val_is_neg (val))
+wi = -wi;
   isl_val_free (val);
+  return wi;
+}
+
+/* Converts an isl_ast_expr_int expression E to a GCC expression tree of
+   type TYPE.  Raises a code generation error when the constant doesn't
fit.  */
+
+tree translate_isl_ast_to_gimple::
+gcc_expression_from_isl_expr_int (tree type, __isl_take isl_ast_expr *expr)
+{
+  widest_int wi = widest_int_from_isl_expr_int (expr);
   isl_ast_expr_free (expr);
-  return res;
+  if (codegen_error_p ())
+return NULL_TREE;
+  if (wi::min_precision (wi, TYPE_SIGN (type)) > TYPE_PRECISION (type))
+{
+  set_codegen_error ();
+  return NULL_TREE;
+}
+  return wide_int_to_tree (type, wi);
 }

 /* Converts a binary isl_ast_expr_op expression E to a GCC expression tree
of
@@ -296,14 +314,25 @@ gcc_expression_from_isl_expr_int (tree t
 tree translate_isl_ast_to_gimple::
 binary_op_to_tree (tree type, 

Re: [PATCH][GRAPHITE] Fix PR69728 in "another" way

2017-10-12 Thread Sebastian Pop
On Oct 12, 2017 9:08 AM, "Richard Biener"  wrote:


I made scheduling fail when we end up with an empty domain, but as
I forgot to actually check the return value of build_original_schedule
the fix was equivalent to just doing nothing to the schedule when
it has an empty domain.  I verified that for the testcase it DCEs
the relevant stmt and that this is a valid transform.

Bootstrapped and tested on x86_64-unknown-linux-gnu, SPEC 2k6 is also
happy with this.

Committed as obvious (since no functional change).

Richard.

2017-10-12  Richard Biener  

PR tree-optimization/69728
Revert
2017-09-19  Richard Biener  

PR tree-optimization/69728
* graphite-sese-to-poly.c (schedule_error): New global.
(add_loop_schedule): Handle empty domain by failing the
schedule.
(build_original_schedule): Handle schedule_error.

* graphite-sese-to-poly.c (add_loop_schedule): Handle empty
domain by returning an unchanged schedule.

* gcc.dg/graphite/pr69728.c: Adjust to reflect we can handle
the loop now.  Remove unrelated undefined behavior.


Looks good.


Index: gcc/graphite-sese-to-poly.c
===
--- gcc/graphite-sese-to-poly.c (revision 253645)
+++ gcc/graphite-sese-to-poly.c (working copy)
@@ -1066,8 +1051,6 @@ outer_projection_mupa (__isl_take isl_un
   return isl_multi_union_pw_aff_from_union_pw_multi_aff (data.res);
 }

-static bool schedule_error;
-
 /* Embed SCHEDULE in the constraints of the LOOP domain.  */

 static isl_schedule *
@@ -1082,11 +1065,9 @@ add_loop_schedule (__isl_take isl_schedu
 return empty < 0 ? isl_schedule_free (schedule) : schedule;

   isl_union_set *domain = isl_schedule_get_domain (schedule);
-  /* We cannot apply an empty domain to pbbs in this loop so fail.
- ??? Somehow drop pbbs in the loop instead.  */
+  /* We cannot apply an empty domain to pbbs in this loop so return
early.  */
   if (isl_union_set_is_empty (domain))
 {
-  schedule_error = true;
   isl_union_set_free (domain);
   return schedule;
 }
@@ -1216,8 +1197,6 @@ build_schedule_loop_nest (scop_p scop, i
 static bool
 build_original_schedule (scop_p scop)
 {
-  schedule_error = false;
-
   int i = 0;
   int n = scop->pbbs.length ();
   while (i < n)
@@ -1232,14 +1211,6 @@ build_original_schedule (scop_p scop)
   scop->original_schedule = add_in_sequence (scop->original_schedule,
s);
 }

-  if (schedule_error)
-{
-  if (dump_file)
-   fprintf (dump_file, "[sese-to-poly] failed to build "
-"original schedule\n");
-  return false;
-}
-
   if (dump_file)
 {
   fprintf (dump_file, "[sese-to-poly] original schedule:\n");
Index: gcc/testsuite/gcc.dg/graphite/pr69728.c
===
--- gcc/testsuite/gcc.dg/graphite/pr69728.c (revision 253645)
+++ gcc/testsuite/gcc.dg/graphite/pr69728.c (working copy)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -floop-nest-optimize" } */
+/* { dg-options "-O3 -floop-nest-optimize -fdump-tree-graphite-details" }
*/

-int a[1];
+int a[9];
 int b, c, d, e;
 void
 fn1 ()
@@ -19,3 +19,9 @@ fn1 ()
}
 }
 }
+
+/* At the moment only ISL figures that if (d) is always true.  We've
+   run into scheduling issues before here, not being able to handle
+   empty domains.  */
+
+/* { dg-final { scan-tree-dump "loop nest optimized" "graphite" } }  */


Re: [PATCH][GRAPHITE] Lift some IV restrictions

2017-10-12 Thread Sebastian Pop
On Oct 12, 2017 9:29 AM, "Richard Biener"  wrote:


The type check seems premature (we're checking CHRECs already) and
we certainly can handle POINTER IVs just fine.

Bootstrap / regtest running on x86_64-unknown-linux-gnu.

SPEC CPU 2k6 sees ~100 more loop nest optimizations that way.

Ok?

[I'd rather have problematical testcases for those weird
restrictions]


Sounds good.
Thanks.


Thanks,
Richard.

2017-10-12  Richard Biener  

* graphite-scop-detection.c (loop_ivs_can_be_represented): Remove.
(scop_detection::harmful_loop_in_region): Remove premature
IV type restriction.
(scop_detection::graphite_can_represent_scev): We can handle
pointer IVs just fine.

Index: gcc/graphite-scop-detection.c
===
--- gcc/graphite-scop-detection.c   (revision 253676)
+++ gcc/graphite-scop-detection.c   (working copy)
@@ -254,28 +254,6 @@ dot_cfg ()
   scops.release ();
 }

-/* Can all ivs be represented by a signed integer?
-   As isl might generate negative values in its expressions, signed loop ivs
-   are required in the backend.  */
-
-static bool
-loop_ivs_can_be_represented (loop_p loop)
-{
-  unsigned type_long_long = TYPE_PRECISION (long_long_integer_type_node);
-  for (gphi_iterator psi = gsi_start_phis (loop->header); !gsi_end_p (psi);
-   gsi_next ())
-{
-  gphi *phi = psi.phi ();
-  tree res = PHI_RESULT (phi);
-  tree type = TREE_TYPE (res);
-
-  if (TYPE_UNSIGNED (type) && TYPE_PRECISION (type) >= type_long_long)
-   return false;
-}
-
-  return true;
-}
-
 /* Returns a COND_EXPR statement when BB has a single predecessor, the
edge between BB and its predecessor is not a loop exit edge, and
the last statement of the single predecessor is a COND_EXPR.  */
@@ -822,13 +800,6 @@ scop_detection::harmful_loop_in_region (
  return true;
}

-  if (! loop_ivs_can_be_represented (loop))
-   {
- DEBUG_PRINT (dp << "[scop-detection-fail] loop_" << loop->num
-  << "IV cannot be represented.\n");
- return true;
-   }
-
   /* Check if all loop nests have at least one data reference.
 ???  This check is expensive and loops premature at this point.
 If important to retain we can pre-compute this for all innermost
@@ -968,14 +939,6 @@ scop_detection::graphite_can_represent_s
   if (chrec_contains_undetermined (scev))
 return false;

-  /* We disable the handling of pointer types, because it’s currently not
- supported by Graphite with the isl AST generator. SSA_NAME nodes are
- the only nodes, which are disabled in case they are pointers to object
- types, but this can be changed.  */
-
-  if (POINTER_TYPE_P (TREE_TYPE (scev)) && TREE_CODE (scev) == SSA_NAME)
-return false;
-
   switch (TREE_CODE (scev))
 {
 case NEGATE_EXPR:


Re: [PATCH][GRAPHITE] Fix PR82451 (and PR82355 in a different way)

2017-10-12 Thread Sebastian Pop
On Oct 11, 2017 9:43 AM, "Richard Biener"  wrote:


For PR82355 I introduced a fake dimension to ISL to allow CHRECs
having an evolution in a loop that isn't fully part of the SESE
region we are processing.  That was easier than fending off those
CHRECs (without simply giving up on SESE regions with those).

But it didn't fully solve the issue as PR82451 shows where we run
into the issue that we eventually have to code-gen those
evolutions and thus in theory need a canonical IV of that containing loop.

So I decided (after Micha pressuring me a bit...) to revisit the
original issue and make SCEV analysis "properly" handle SE regions.
It turns out that it is mostly instantiate_scev lacking proper support
plus the necessary interfacing change (really just cosmetic in some sense)
from an instantiate_below basic-block to an instantiate_below edge.


Very nice.


data-ref interfaces have been similarly adjusted, here changing
the "loop nest" loop parameter to the entry edge for the SE region
and passing that down accordingly.

I've for now tried to keep other high-level loop-based interfaces the
same by simply using the loop preheader edge as entry where appropriate
(needing loop_preheader_edge to cope with the loop root tree for simplicity).

In the process I ran into issues with us too aggressively
instantiating random trees and thus I cut those down.  That part
doesn't successfully test separately (when I remove the strange
ARRAY_REF instantiation), so it's part of this patch.  I've also
run into an SSA verification fail (the id-27.f90 testcase) which
shows we _do_ need to clear the SCEV cache after introducing
the versioned CFG (and added a comment before it).

On the previously failing testcases I've verified we produce
sensible instantiations for those pesky refs residing in "no" loop
in the SCOP and that we get away with the result in terms of
optimizing.

SPEC 2k6 testing shows

loop nest optimized: 311
loop nest not optimized, code generation error: 0
loop nest not optimized, optimized schedule is identical to original
schedule: 173
loop nest not optimized, optimization timed out: 59
loop nest not optimized, ISL signalled an error: 9
loop nest: 552

for SPEC 2k6 and -floop-nest-optimize while adding -fgraphite-identity
still reveals some codegen errors:

loop nest optimized: 437
loop nest not optimized, code generation error: 25
loop nest not optimized, optimized schedule is identical to original
schedule: 169
loop nest not optimized, optimization timed out: 60
loop nest not optimized, ISL signalled an error: 9
loop nest: 700

Bootstrap and regtest in progress on x86_64-unknown-linux-gnu
(with and without -fgraphite-identity -floop-nest-optimize).

Ok?


Looks good to me.
Thanks.


Thanks,
Richard.

2017-10-11  Richard Biener  

PR tree-optimization/82451
Revert
2017-10-02  Richard Biener  

PR tree-optimization/82355
* graphite-isl-ast-to-gimple.c (build_iv_mapping): Also build
a mapping for the enclosing loop but avoid generating one for
the loop tree root.
(copy_bb_and_scalar_dependences): Remove premature codegen
error on PHIs in blocks duplicated into multiple places.
* graphite-scop-detection.c
(scop_detection::stmt_has_simple_data_refs_p): For a loop not
in the region use it as loop and nest to analyze the DR in.
(try_generate_gimple_bb): Likewise.
* graphite-sese-to-poly.c (extract_affine_chrec): Adjust.
(add_loop_constraints): For blocks in a loop not in the region
create a dimension with a single iteration.
* sese.h (gbb_loop_at_index): Remove assert.

* cfgloop.c (loop_preheader_edge): For the loop tree root
return the single successor of the entry block.
* graphite-isl-ast-to-gimple.c (graphite_regenerate_ast_isl):
Reset the SCEV hashtable and niters.
* graphite-scop-detection.c
(scop_detection::graphite_can_represent_scev): Add SCOP parameter,
assert that we only have POLYNOMIAL_CHREC that vary in loops
contained in the region.
(scop_detection::graphite_can_represent_expr): Adjust.
(scop_detection::stmt_has_simple_data_refs_p): For loops
not in the region set loop to NULL.  The nest is now the
entry edge to the region.
(try_generate_gimple_bb): Likewise.
* sese.c (scalar_evolution_in_region): Adjust for
instantiate_scev change.
* tree-data-ref.h (graphite_find_data_references_in_stmt):
Make nest parameter the edge into the region.
(create_data_ref): Likewise.
* tree-data-ref.c (dr_analyze_indices): Make nest parameter an
entry edge into a region and adjust instantiate_scev calls.
(create_data_ref): Likewise.
(graphite_find_data_references_in_stmt): Likewise.
(find_data_references_in_stmt): Pass the loop preheader 

[PATCH][GRAPHITE] Lift some IV restrictions

2017-10-12 Thread Richard Biener

The type check seems premature (we're checking CHRECs already) and
we certainly can handle POINTER IVs just fine.

Bootstrap / regtest running on x86_64-unknown-linux-gnu.

SPEC CPU 2k6 sees ~100 more loop nest optimizations that way.

Ok?

[I'd rather have problematical testcases for those weird
restrictions]

Thanks,
Richard.

2017-10-12  Richard Biener  

* graphite-scop-detection.c (loop_ivs_can_be_represented): Remove.
(scop_detection::harmful_loop_in_region): Remove premature
IV type restriction.
(scop_detection::graphite_can_represent_scev): We can handle
pointer IVs just fine.

Index: gcc/graphite-scop-detection.c
===
--- gcc/graphite-scop-detection.c   (revision 253676)
+++ gcc/graphite-scop-detection.c   (working copy)
@@ -254,28 +254,6 @@ dot_cfg ()
   scops.release ();
 }
 
-/* Can all ivs be represented by a signed integer?
-   As isl might generate negative values in its expressions, signed loop ivs
-   are required in the backend.  */
-
-static bool
-loop_ivs_can_be_represented (loop_p loop)
-{
-  unsigned type_long_long = TYPE_PRECISION (long_long_integer_type_node);
-  for (gphi_iterator psi = gsi_start_phis (loop->header); !gsi_end_p (psi);
-   gsi_next ())
-{
-  gphi *phi = psi.phi ();
-  tree res = PHI_RESULT (phi);
-  tree type = TREE_TYPE (res);
-
-  if (TYPE_UNSIGNED (type) && TYPE_PRECISION (type) >= type_long_long)
-   return false;
-}
-
-  return true;
-}
-
 /* Returns a COND_EXPR statement when BB has a single predecessor, the
edge between BB and its predecessor is not a loop exit edge, and
the last statement of the single predecessor is a COND_EXPR.  */
@@ -822,13 +800,6 @@ scop_detection::harmful_loop_in_region (
  return true;
}
 
-  if (! loop_ivs_can_be_represented (loop))
-   {
- DEBUG_PRINT (dp << "[scop-detection-fail] loop_" << loop->num
-  << "IV cannot be represented.\n");
- return true;
-   }
-
   /* Check if all loop nests have at least one data reference.
 ???  This check is expensive and loops premature at this point.
 If important to retain we can pre-compute this for all innermost
@@ -968,14 +939,6 @@ scop_detection::graphite_can_represent_s
   if (chrec_contains_undetermined (scev))
 return false;
 
-  /* We disable the handling of pointer types, because it’s currently not
- supported by Graphite with the isl AST generator. SSA_NAME nodes are
- the only nodes, which are disabled in case they are pointers to object
- types, but this can be changed.  */
-
-  if (POINTER_TYPE_P (TREE_TYPE (scev)) && TREE_CODE (scev) == SSA_NAME)
-return false;
-
   switch (TREE_CODE (scev))
 {
 case NEGATE_EXPR:

[PATCH][GRAPHITE] Fix PR69728 in "another" way

2017-10-12 Thread Richard Biener

I made scheduling fail when we end up with an empty domain but as
I forgot to actually check the return value of build_original_schedule
the fix was equivalent to just doing nothing to the schedule when
it has an empty domain.  I verified that for the testcase it DCEs
the relevant stmt and that this is a valid transform.

Bootstrapped and tested on x86_64-unknown-linux-gnu, SPEC 2k6 is also
happy with this.

Committed as obvious (since no functional change).

Richard.

2017-10-12  Richard Biener  

PR tree-optimization/69728
Revert
2017-09-19  Richard Biener  

PR tree-optimization/69728
* graphite-sese-to-poly.c (schedule_error): New global.
(add_loop_schedule): Handle empty domain by failing the
schedule.
(build_original_schedule): Handle schedule_error.

* graphite-sese-to-poly.c (add_loop_schedule): Handle empty
domain by returning an unchanged schedule.

* gcc.dg/graphite/pr69728.c: Adjust to reflect we can handle
the loop now.  Remove unrelated undefined behavior.

Index: gcc/graphite-sese-to-poly.c
===
--- gcc/graphite-sese-to-poly.c (revision 253645)
+++ gcc/graphite-sese-to-poly.c (working copy)
@@ -1066,8 +1051,6 @@ outer_projection_mupa (__isl_take isl_un
   return isl_multi_union_pw_aff_from_union_pw_multi_aff (data.res);
 }
 
-static bool schedule_error;
-
 /* Embed SCHEDULE in the constraints of the LOOP domain.  */
 
 static isl_schedule *
@@ -1082,11 +1065,9 @@ add_loop_schedule (__isl_take isl_schedu
 return empty < 0 ? isl_schedule_free (schedule) : schedule;
 
   isl_union_set *domain = isl_schedule_get_domain (schedule);
-  /* We cannot apply an empty domain to pbbs in this loop so fail.
- ??? Somehow drop pbbs in the loop instead.  */
+  /* We cannot apply an empty domain to pbbs in this loop so return early.  */
   if (isl_union_set_is_empty (domain))
 {
-  schedule_error = true;
   isl_union_set_free (domain);
   return schedule;
 }
@@ -1216,8 +1197,6 @@ build_schedule_loop_nest (scop_p scop, i
 static bool
 build_original_schedule (scop_p scop)
 {
-  schedule_error = false;
-
   int i = 0;
   int n = scop->pbbs.length ();
   while (i < n)
@@ -1232,14 +1211,6 @@ build_original_schedule (scop_p scop)
   scop->original_schedule = add_in_sequence (scop->original_schedule, s);
 }
 
-  if (schedule_error)
-{
-  if (dump_file)
-   fprintf (dump_file, "[sese-to-poly] failed to build "
-"original schedule\n");
-  return false;
-}
-
   if (dump_file)
 {
   fprintf (dump_file, "[sese-to-poly] original schedule:\n");
Index: gcc/testsuite/gcc.dg/graphite/pr69728.c
===
--- gcc/testsuite/gcc.dg/graphite/pr69728.c (revision 253645)
+++ gcc/testsuite/gcc.dg/graphite/pr69728.c (working copy)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -floop-nest-optimize" } */
+/* { dg-options "-O3 -floop-nest-optimize -fdump-tree-graphite-details" } */
 
-int a[1];
+int a[9];
 int b, c, d, e;
 void
 fn1 ()
@@ -19,3 +19,9 @@ fn1 ()
}
 }
 }
+
+/* At the moment only ISL figures that if (d) is always true.  We've
+   run into scheduling issues before here, not being able to handle
+   empty domains.  */
+
+/* { dg-final { scan-tree-dump "loop nest optimized" "graphite" } }  */


Re: [PATCH][GRAPHITE] Fix PR82451 (and PR82355 in a different way)

2017-10-12 Thread Bin.Cheng
On Thu, Oct 12, 2017 at 12:13 PM, Richard Biener  wrote:
> On Thu, 12 Oct 2017, Bin.Cheng wrote:
>
>> On Wed, Oct 11, 2017 at 3:43 PM, Richard Biener  wrote:
>> >
>> > For PR82355 I introduced a fake dimension to ISL to allow CHRECs
>> > having an evolution in a loop that isn't fully part of the SESE
>> > region we are processing.  That was easier than fending off those
>> > CHRECs (without simply giving up on SESE regions with those).
>> >
>> > But it didn't fully solve the issue as PR82451 shows where we run
>> > into the issue that we eventually have to code-gen those
>> > evolutions and thus in theory need a canonical IV of that containing loop.
>> >
>> > So I decided (after Micha pressuring me a bit...) to revisit the
>> > original issue and make SCEV analysis "properly" handle SE regions.
>> > It turns out that it is mostly instantiate_scev lacking proper support
>> > plus the necessary interfacing change (really just cosmetic in some sense)
>> > from a instantiate_before basic-block to a instantiate_before edge.
>> >
>> > data-ref interfaces have been similarly adjusted, here changing
>> > the "loop nest" loop parameter to the entry edge for the SE region
>> > and passing that down accordingly.
>> >
>> > I've for now tried to keep other high-level loop-based interfaces the
>> > same by simply using the loop preheader edge as entry where appropriate
>> > (needing loop_preheader_edge cope with the loop root tree for simplicity).
>> >
>> > In the process I ran into issues with us too overly aggressive
>> > instantiating random trees and thus I cut those down.  That part
>> > doesn't successfully test separately (when I remove the strange
>> > ARRAY_REF instantiation), so it's part of this patch.  I've also
>> > run into an SSA verification fail (the id-27.f90 testcase) which
>> > shows we _do_ need to clear the SCEV cache after introducing
>> > the versioned CFG (and added a comment before it).
>> >
>> > On the previously failing testcases I've verified we produce
>> > sensible instantiations for those pesky refs residing in "no" loop
>> > in the SCOP and that we get away with the result in terms of
>> > optimizing.
>> >
>> > SPEC 2k6 testing shows
>> >
>> > loop nest optimized: 311
>> > loop nest not optimized, code generation error: 0
>> > loop nest not optimized, optimized schedule is identical to original
>> > schedule: 173
>> > loop nest not optimized, optimization timed out: 59
>> > loop nest not optimized, ISL signalled an error: 9
>> > loop nest: 552
>> >
>> > for SPEC 2k6 and -floop-nest-optimize while adding -fgraphite-identity
>> > still reveals some codegen errors:
>> >
>> > loop nest optimized: 437
>> > loop nest not optimized, code generation error: 25
>> > loop nest not optimized, optimized schedule is identical to original
>> > schedule: 169
>> > loop nest not optimized, optimization timed out: 60
>> > loop nest not optimized, ISL signalled an error: 9
>> > loop nest: 700
>> >
>> > Bootstrap and regtest in progress on x86_64-unknown-linux-gnu
>> > (with and without -fgraphite-identity -floop-nest-optimize).
>> >
>> > Ok?
>> >
>> > Thanks,
>> > Richard.
>> >
>>
>> > Index: gcc/tree-scalar-evolution.c
>> > ===
>> > --- gcc/tree-scalar-evolution.c (revision 253645)
>> > +++ gcc/tree-scalar-evolution.c (working copy)
>> > @@ -2344,7 +2348,7 @@ static tree instantiate_scev_r (basic_bl
>> > instantiated, and to stop if it exceeds some limit.  */
>> >
>> >  static tree
>> > -instantiate_scev_name (basic_block instantiate_below,
>> > +instantiate_scev_name (edge instantiate_below,
>> >struct loop *evolution_loop, struct loop 
>> > *inner_loop,
>> >tree chrec,
>> >bool *fold_conversions,
>> > @@ -2358,7 +2362,7 @@ instantiate_scev_name (basic_block insta
>> >   evolutions in outer loops), nothing to do.  */
>> >if (!def_bb
>> >|| loop_depth (def_bb->loop_father) == 0
>> > -  || dominated_by_p (CDI_DOMINATORS, instantiate_below, def_bb))
>> > +  || ! dominated_by_p (CDI_DOMINATORS, def_bb, 
>> > instantiate_below->dest))
>> >  return chrec;
>> >
>> >/* We cache the value of instantiated variable to avoid exponential
>> > @@ -2380,6 +2384,51 @@ instantiate_scev_name (basic_block insta
>> >
>> >def_loop = find_common_loop (evolution_loop, def_bb->loop_father);
>> >
>> > +  if (! dominated_by_p (CDI_DOMINATORS,
>> > +   def_loop->header, instantiate_below->dest))
>> > +{
>> > +  gimple *def = SSA_NAME_DEF_STMT (chrec);
>> > +  if (gassign *ass = dyn_cast  (def))
>> > +   {
>> > + switch (gimple_assign_rhs_class (ass))
>> > +   {
>> > +   case GIMPLE_UNARY_RHS:
>> > + {
>> > +   tree op0 = instantiate_scev_r (instantiate_below, 
>> > evolution_loop,
>> > +  

Re: [PATCH][GRAPHITE] Fix PR82451 (and PR82355 in a different way)

2017-10-12 Thread Richard Biener
On Thu, 12 Oct 2017, Bin.Cheng wrote:

> On Wed, Oct 11, 2017 at 3:43 PM, Richard Biener  wrote:
> >
> > For PR82355 I introduced a fake dimension to ISL to allow CHRECs
> > having an evolution in a loop that isn't fully part of the SESE
> > region we are processing.  That was easier than fending off those
> > CHRECs (without simply giving up on SESE regions with those).
> >
> > But it didn't fully solve the issue as PR82451 shows where we run
> > into the issue that we eventually have to code-gen those
> > evolutions and thus in theory need a canonical IV of that containing loop.
> >
> > So I decided (after Micha pressuring me a bit...) to revisit the
> > original issue and make SCEV analysis "properly" handle SE regions.
> > It turns out that it is mostly instantiate_scev lacking proper support
> > plus the necessary interfacing change (really just cosmetic in some sense)
> > from a instantiate_before basic-block to a instantiate_before edge.
> >
> > data-ref interfaces have been similarly adjusted, here changing
> > the "loop nest" loop parameter to the entry edge for the SE region
> > and passing that down accordingly.
> >
> > I've for now tried to keep other high-level loop-based interfaces the
> > same by simply using the loop preheader edge as entry where appropriate
> > (needing loop_preheader_edge cope with the loop root tree for simplicity).
> >
> > In the process I ran into issues with us too overly aggressive
> > instantiating random trees and thus I cut those down.  That part
> > doesn't successfully test separately (when I remove the strange
> > ARRAY_REF instantiation), so it's part of this patch.  I've also
> > run into an SSA verification fail (the id-27.f90 testcase) which
> > shows we _do_ need to clear the SCEV cache after introducing
> > the versioned CFG (and added a comment before it).
> >
> > On the previously failing testcases I've verified we produce
> > sensible instantiations for those pesky refs residing in "no" loop
> > in the SCOP and that we get away with the result in terms of
> > optimizing.
> >
> > SPEC 2k6 testing shows
> >
> > loop nest optimized: 311
> > loop nest not optimized, code generation error: 0
> > loop nest not optimized, optimized schedule is identical to original
> > schedule: 173
> > loop nest not optimized, optimization timed out: 59
> > loop nest not optimized, ISL signalled an error: 9
> > loop nest: 552
> >
> > for SPEC 2k6 and -floop-nest-optimize while adding -fgraphite-identity
> > still reveals some codegen errors:
> >
> > loop nest optimized: 437
> > loop nest not optimized, code generation error: 25
> > loop nest not optimized, optimized schedule is identical to original
> > schedule: 169
> > loop nest not optimized, optimization timed out: 60
> > loop nest not optimized, ISL signalled an error: 9
> > loop nest: 700
> >
> > Bootstrap and regtest in progress on x86_64-unknown-linux-gnu
> > (with and without -fgraphite-identity -floop-nest-optimize).
> >
> > Ok?
> >
> > Thanks,
> > Richard.
> >
> 
> > Index: gcc/tree-scalar-evolution.c
> > ===
> > --- gcc/tree-scalar-evolution.c (revision 253645)
> > +++ gcc/tree-scalar-evolution.c (working copy)
> > @@ -2344,7 +2348,7 @@ static tree instantiate_scev_r (basic_bl
> > instantiated, and to stop if it exceeds some limit.  */
> >
> >  static tree
> > -instantiate_scev_name (basic_block instantiate_below,
> > +instantiate_scev_name (edge instantiate_below,
> >struct loop *evolution_loop, struct loop *inner_loop,
> >tree chrec,
> >bool *fold_conversions,
> > @@ -2358,7 +2362,7 @@ instantiate_scev_name (basic_block insta
> >   evolutions in outer loops), nothing to do.  */
> >if (!def_bb
> >|| loop_depth (def_bb->loop_father) == 0
> > -  || dominated_by_p (CDI_DOMINATORS, instantiate_below, def_bb))
> > +  || ! dominated_by_p (CDI_DOMINATORS, def_bb, 
> > instantiate_below->dest))
> >  return chrec;
> >
> >/* We cache the value of instantiated variable to avoid exponential
> > @@ -2380,6 +2384,51 @@ instantiate_scev_name (basic_block insta
> >
> >def_loop = find_common_loop (evolution_loop, def_bb->loop_father);
> >
> > +  if (! dominated_by_p (CDI_DOMINATORS,
> > +   def_loop->header, instantiate_below->dest))
> > +{
> > +  gimple *def = SSA_NAME_DEF_STMT (chrec);
> > +  if (gassign *ass = dyn_cast  (def))
> > +   {
> > + switch (gimple_assign_rhs_class (ass))
> > +   {
> > +   case GIMPLE_UNARY_RHS:
> > + {
> > +   tree op0 = instantiate_scev_r (instantiate_below, 
> > evolution_loop,
> > +  inner_loop, 
> > gimple_assign_rhs1 (ass),
> > +  fold_conversions, size_expr);
> > +   if (op0 == chrec_dont_know)
> > + 

Re: [PATCH][GRAPHITE] Fix PR82451 (and PR82355 in a different way)

2017-10-12 Thread Bin.Cheng
On Wed, Oct 11, 2017 at 3:43 PM, Richard Biener  wrote:
>
> For PR82355 I introduced a fake dimension to ISL to allow CHRECs
> having an evolution in a loop that isn't fully part of the SESE
> region we are processing.  That was easier than fending off those
> CHRECs (without simply giving up on SESE regions with those).
>
> But it didn't fully solve the issue as PR82451 shows where we run
> into the issue that we eventually have to code-gen those
> evolutions and thus in theory need a canonical IV of that containing loop.
>
> So I decided (after Micha pressuring me a bit...) to revisit the
> original issue and make SCEV analysis "properly" handle SE regions.
> It turns out that it is mostly instantiate_scev lacking proper support
> plus the necessary interfacing change (really just cosmetic in some sense)
> from a instantiate_before basic-block to a instantiate_before edge.
>
> data-ref interfaces have been similarly adjusted, here changing
> the "loop nest" loop parameter to the entry edge for the SE region
> and passing that down accordingly.
>
> I've for now tried to keep other high-level loop-based interfaces the
> same by simply using the loop preheader edge as entry where appropriate
> (needing loop_preheader_edge cope with the loop root tree for simplicity).
>
> In the process I ran into issues with us too overly aggressive
> instantiating random trees and thus I cut those down.  That part
> doesn't successfully test separately (when I remove the strange
> ARRAY_REF instantiation), so it's part of this patch.  I've also
> run into an SSA verification fail (the id-27.f90 testcase) which
> shows we _do_ need to clear the SCEV cache after introducing
> the versioned CFG (and added a comment before it).
>
> On the previously failing testcases I've verified we produce
> sensible instantiations for those pesky refs residing in "no" loop
> in the SCOP and that we get away with the result in terms of
> optimizing.
>
> SPEC 2k6 testing shows
>
> loop nest optimized: 311
> loop nest not optimized, code generation error: 0
> loop nest not optimized, optimized schedule is identical to original
> schedule: 173
> loop nest not optimized, optimization timed out: 59
> loop nest not optimized, ISL signalled an error: 9
> loop nest: 552
>
> for SPEC 2k6 and -floop-nest-optimize while adding -fgraphite-identity
> still reveals some codegen errors:
>
> loop nest optimized: 437
> loop nest not optimized, code generation error: 25
> loop nest not optimized, optimized schedule is identical to original
> schedule: 169
> loop nest not optimized, optimization timed out: 60
> loop nest not optimized, ISL signalled an error: 9
> loop nest: 700
>
> Bootstrap and regtest in progress on x86_64-unknown-linux-gnu
> (with and without -fgraphite-identity -floop-nest-optimize).
>
> Ok?
>
> Thanks,
> Richard.
>

> Index: gcc/tree-scalar-evolution.c
> ===
> --- gcc/tree-scalar-evolution.c (revision 253645)
> +++ gcc/tree-scalar-evolution.c (working copy)
> @@ -2344,7 +2348,7 @@ static tree instantiate_scev_r (basic_bl
> instantiated, and to stop if it exceeds some limit.  */
>
>  static tree
> -instantiate_scev_name (basic_block instantiate_below,
> +instantiate_scev_name (edge instantiate_below,
>struct loop *evolution_loop, struct loop *inner_loop,
>tree chrec,
>bool *fold_conversions,
> @@ -2358,7 +2362,7 @@ instantiate_scev_name (basic_block insta
>   evolutions in outer loops), nothing to do.  */
>if (!def_bb
>|| loop_depth (def_bb->loop_father) == 0
> -  || dominated_by_p (CDI_DOMINATORS, instantiate_below, def_bb))
> +  || ! dominated_by_p (CDI_DOMINATORS, def_bb, instantiate_below->dest))
>  return chrec;
>
>/* We cache the value of instantiated variable to avoid exponential
> @@ -2380,6 +2384,51 @@ instantiate_scev_name (basic_block insta
>
>def_loop = find_common_loop (evolution_loop, def_bb->loop_father);
>
> +  if (! dominated_by_p (CDI_DOMINATORS,
> +   def_loop->header, instantiate_below->dest))
> +{
> +  gimple *def = SSA_NAME_DEF_STMT (chrec);
> +  if (gassign *ass = dyn_cast  (def))
> +   {
> + switch (gimple_assign_rhs_class (ass))
> +   {
> +   case GIMPLE_UNARY_RHS:
> + {
> +   tree op0 = instantiate_scev_r (instantiate_below, 
> evolution_loop,
> +  inner_loop, gimple_assign_rhs1 
> (ass),
> +  fold_conversions, size_expr);
> +   if (op0 == chrec_dont_know)
> + return chrec_dont_know;
> +   res = fold_build1 (gimple_assign_rhs_code (ass),
> +  TREE_TYPE (chrec), op0);
> +   break;
> + }
> +   case GIMPLE_BINARY_RHS:
> + {
> +   

[PATCH][GRAPHITE] Fix PR82525

2017-10-12 Thread Richard Biener

The following avoids code-generation errors for modulo operations 
resulting from our own constraints ending up as no-ops because
the type we code-generate in already imposes the modulo operation.

For the case in SPEC 2k6 triggering this we'd even know the
modulo constraint isn't necessary - we have

 int64_t lower, upper;
 if (lower < upper)
   uint64_t niter = (uint64_t)upper - (uint64_t)lower;

but there's no way to represent in GIMPLE that subtracting
the two signed values will yield a positive value fitting
in the corresponding unsigned type...  We'd need sth
like a MINUSU_EXPR (like the often proposed ABSU_EXPR or
the proposed POINTER_DIFF_EXPR).

This fixes the last code generation errors with SPEC CPU 2006
and -fgraphite-identity -floop-nest-optimize.

loop nest optimized: 483
loop nest not optimized, code generation error: 0
loop nest not optimized, optimized schedule is identical to original 
schedule: 173
loop nest not optimized, optimization timed out: 60
loop nest not optimized, ISL signalled an error: 9
loop nest: 725

Note that we still (with and without this patch) get miscompares
in 465.tonto, 416.gamess and 403.gcc (we have that "wrong"
constraint thing leading to empty domains if you remember).

Bootstrap and regtest running on x86_64-unknown-linux-gnu, ok?

Thanks,
Richard.

2017-10-12  Richard Biener  

PR tree-optimization/82525
* graphite-isl-ast-to-gimple.c
(translate_isl_ast_to_gimple::widest_int_from_isl_expr_int): Split
out from ...
(translate_isl_ast_to_gimple::gcc_expression_from_isl_expr_int): Here.
Fail code generation when we cannot represent the isl integer.
(binary_op_to_tree): Elide modulo operations that are no-ops
in the type we code generate.  Remove now superfluous code
generation errors.

* gcc.dg/graphite/id-30.c: New testcase.
* gfortran.dg/graphite/id-28.f90: Likewise.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253645)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -177,6 +177,7 @@ class translate_isl_ast_to_gimple
   tree gcc_expression_from_isl_ast_expr_id (tree type,
__isl_keep isl_ast_expr *expr_id,
ivs_params &ip);
+  widest_int widest_int_from_isl_expr_int (__isl_keep isl_ast_expr *expr);
   tree gcc_expression_from_isl_expr_int (tree type,
 __isl_take isl_ast_expr *expr);
   tree gcc_expression_from_isl_expr_op (tree type,
@@ -265,29 +266,46 @@ gcc_expression_from_isl_ast_expr_id (tre
   return fold_convert (type, *val);
 }
 
-/* Converts an isl_ast_expr_int expression E to a GCC expression tree of
-   type TYPE.  */
+/* Converts an isl_ast_expr_int expression E to a widest_int.
+   Raises a code generation error when the constant doesn't fit.  */
 
-tree translate_isl_ast_to_gimple::
-gcc_expression_from_isl_expr_int (tree type, __isl_take isl_ast_expr *expr)
+widest_int translate_isl_ast_to_gimple::
+widest_int_from_isl_expr_int (__isl_keep isl_ast_expr *expr)
 {
   gcc_assert (isl_ast_expr_get_type (expr) == isl_ast_expr_int);
   isl_val *val = isl_ast_expr_get_val (expr);
   size_t n = isl_val_n_abs_num_chunks (val, sizeof (HOST_WIDE_INT));
   HOST_WIDE_INT *chunks = XALLOCAVEC (HOST_WIDE_INT, n);
-  tree res;
-  if (isl_val_get_abs_num_chunks (val, sizeof (HOST_WIDE_INT), chunks) == -1)
-res = NULL_TREE;
-  else
+  if (n > WIDE_INT_MAX_ELTS
+  || isl_val_get_abs_num_chunks (val, sizeof (HOST_WIDE_INT), chunks) == -1)
 {
-  widest_int wi = widest_int::from_array (chunks, n, true);
-  if (isl_val_is_neg (val))
-   wi = -wi;
-  res = wide_int_to_tree (type, wi);
+  isl_val_free (val);
+  set_codegen_error ();
+  return 0;
 }
+  widest_int wi = widest_int::from_array (chunks, n, true);
+  if (isl_val_is_neg (val))
+wi = -wi;
   isl_val_free (val);
+  return wi;
+}
+
+/* Converts an isl_ast_expr_int expression E to a GCC expression tree of
+   type TYPE.  Raises a code generation error when the constant doesn't fit.  */
+
+tree translate_isl_ast_to_gimple::
+gcc_expression_from_isl_expr_int (tree type, __isl_take isl_ast_expr *expr)
+{
+  widest_int wi = widest_int_from_isl_expr_int (expr);
   isl_ast_expr_free (expr);
-  return res;
+  if (codegen_error_p ())
+return NULL_TREE;
+  if (wi::min_precision (wi, TYPE_SIGN (type)) > TYPE_PRECISION (type))
+{
+  set_codegen_error ();
+  return NULL_TREE;
+}
+  return wide_int_to_tree (type, wi);
 }
 
 /* Converts a binary isl_ast_expr_op expression E to a GCC expression tree of
@@ -296,14 +314,25 @@ gcc_expression_from_isl_expr_int (tree t
 tree translate_isl_ast_to_gimple::
 binary_op_to_tree (tree type, __isl_take isl_ast_expr *expr, ivs_params )
 {
+  enum isl_ast_op_type expr_type = 

[PATCH][GRAPHITE] Fix PR82451 (and PR82355 in a different way)

2017-10-11 Thread Richard Biener

For PR82355 I introduced a fake dimension to ISL to allow CHRECs
having an evolution in a loop that isn't fully part of the SESE
region we are processing.  That was easier than fending off those
CHRECs (without simply giving up on SESE regions with those).

But it didn't fully solve the issue as PR82451 shows where we run
into the issue that we eventually have to code-gen those
evolutions and thus in theory need a canonical IV of that containing loop.

So I decided (after Micha pressuring me a bit...) to revisit the
original issue and make SCEV analysis "properly" handle SE regions.
It turns out that it is mostly instantiate_scev lacking proper support
plus the necessary interfacing change (really just cosmetic in some sense)
from an instantiate_before basic-block to an instantiate_before edge.

data-ref interfaces have been similarly adjusted, here changing
the "loop nest" loop parameter to the entry edge for the SE region
and passing that down accordingly.

I've for now tried to keep other high-level loop-based interfaces the
same by simply using the loop preheader edge as entry where appropriate
(needing loop_preheader_edge to cope with the loop tree root for simplicity).

In the process I ran into issues with us being overly aggressive in
instantiating random trees and thus I cut those down.  That part
doesn't successfully test separately (when I remove the strange
ARRAY_REF instantiation), so it's part of this patch.  I've also
run into an SSA verification fail (the id-27.f90 testcase) which
shows we _do_ need to clear the SCEV cache after introducing
the versioned CFG (and added a comment before it).

On the previously failing testcases I've verified we produce
sensible instantiations for those pesky refs residing in "no" loop
in the SCOP and that we get away with the result in terms of
optimizing.

SPEC 2k6 testing shows

loop nest optimized: 311
loop nest not optimized, code generation error: 0
loop nest not optimized, optimized schedule is identical to original 
schedule: 173
loop nest not optimized, optimization timed out: 59
loop nest not optimized, ISL signalled an error: 9
loop nest: 552

for SPEC 2k6 and -floop-nest-optimize while adding -fgraphite-identity
still reveals some codegen errors:

loop nest optimized: 437
loop nest not optimized, code generation error: 25
loop nest not optimized, optimized schedule is identical to original 
schedule: 169
loop nest not optimized, optimization timed out: 60
loop nest not optimized, ISL signalled an error: 9
loop nest: 700

Bootstrap and regtest in progress on x86_64-unknown-linux-gnu
(with and without -fgraphite-identity -floop-nest-optimize).

Ok?

Thanks,
Richard.

2017-10-11  Richard Biener  

PR tree-optimization/82451
Revert
2017-10-02  Richard Biener  

PR tree-optimization/82355
* graphite-isl-ast-to-gimple.c (build_iv_mapping): Also build
a mapping for the enclosing loop but avoid generating one for
the loop tree root.
(copy_bb_and_scalar_dependences): Remove premature codegen
error on PHIs in blocks duplicated into multiple places.
* graphite-scop-detection.c
(scop_detection::stmt_has_simple_data_refs_p): For a loop not
in the region use it as loop and nest to analyze the DR in.
(try_generate_gimple_bb): Likewise.
* graphite-sese-to-poly.c (extract_affine_chrec): Adjust.
(add_loop_constraints): For blocks in a loop not in the region
create a dimension with a single iteration.
* sese.h (gbb_loop_at_index): Remove assert.

* cfgloop.c (loop_preheader_edge): For the loop tree root
return the single successor of the entry block.
* graphite-isl-ast-to-gimple.c (graphite_regenerate_ast_isl):
Reset the SCEV hashtable and niters.
* graphite-scop-detection.c
(scop_detection::graphite_can_represent_scev): Add SCOP parameter,
assert that we only have POLYNOMIAL_CHREC that vary in loops
contained in the region.
(scop_detection::graphite_can_represent_expr): Adjust.
(scop_detection::stmt_has_simple_data_refs_p): For loops
not in the region set loop to NULL.  The nest is now the
entry edge to the region.
(try_generate_gimple_bb): Likewise.
* sese.c (scalar_evolution_in_region): Adjust for
instantiate_scev change.
* tree-data-ref.h (graphite_find_data_references_in_stmt):
Make nest parameter the edge into the region.
(create_data_ref): Likewise.
* tree-data-ref.c (dr_analyze_indices): Make nest parameter an
entry edge into a region and adjust instantiate_scev calls.
(create_data_ref): Likewise.
(graphite_find_data_references_in_stmt): Likewise.
(find_data_references_in_stmt): Pass the loop preheader edge
from the nest argument.
* tree-scalar-evolution.h (instantiate_scev): Make instantiate_below
 

Re: [PATCH][GRAPHITE] Fix PR82449

2017-10-11 Thread Sebastian Pop
On Oct 9, 2017 8:48 AM, "Richard Biener"  wrote:

On Mon, 9 Oct 2017, Richard Biener wrote:

> On Fri, 6 Oct 2017, Sebastian Pop wrote:
>
> > On Fri, Oct 6, 2017 at 8:33 AM, Richard Biener 
wrote:
> >
> > > On Fri, 6 Oct 2017, Sebastian Pop wrote:
> > >
> > > > On Fri, Oct 6, 2017 at 6:56 AM, Richard Biener 
> > > wrote:
> > > >
> > > > >
> > > > > The following fences off a few more SCEVs through
scev_analyzable_p
> > > given
> > > > > at the end we need those pass chrec_apply when getting a rename
through
> > > > > SCEV.
> > > > >
> > > > > The SCEV in question is
> > > > >
> > > > >   {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
> > > > >
> > > > > which fails to chrec_apply in the CHREC_LEFT part because that
part
> > > > > is not affine (and we're usually not replacing a IV with a
constant
> > > > > where chrec_apply might handle one or the other case).
> > > > >
> > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > > > >
> > > > > This fixes three out of the remaining 8 codegen errors in SPEC CPU
> > > 2006.
> > > > >
> > > > > Ok?
> > > > >
> > > > > Thanks,
> > > > > Richard.
> > > > >
> > > > > 2017-10-06  Richard Biener  
> > > > >
> > > > > PR tree-optimization/82449
> > > > > * sese.c (can_chrec_apply): New function.
> > > > > (scev_analyzable_p): Check we can call chrec_apply on the
SCEV.
> > > > >
> > > > > * gfortran.dg/graphite/pr82449.f: New testcase.
> > >
> > > >
> > > > > Index: gcc/sese.c
> > > > > 
===
> > > > > --- gcc/sese.c  (revision 253477)
> > > > > +++ gcc/sese.c  (working copy)
> > > > > @@ -421,6 +421,27 @@ invariant_in_sese_p_rec (tree t, const s
> > > > >return true;
> > > > >  }
> > > > >
> > > > > +/* Check whether we can call chrec_apply on CHREC with arbitrary
X and
> > > > > VAR.  */
> > >
> > > > +
> > > > > +static bool
> > > > > +can_chrec_apply (tree chrec)
> > >
> >
> > Could we use scev_is_linear_expression ?
> > It seems like can_chrec_apply has the same semantics.
>
> Looks like that works.
>
> >
> > > > > +{
> > > > > +  if (automatically_generated_chrec_p (chrec))
> > > > > +return false;
> > > > > +  switch (TREE_CODE (chrec))
> > > > > +{
> > > > > +case POLYNOMIAL_CHREC:
> > > > > +  if (evolution_function_is_affine_p (chrec))
> > > > > +   return (can_chrec_apply (CHREC_LEFT (chrec))
> > > > > +   && can_chrec_apply (CHREC_RIGHT (chrec)));
> > > > > +  return false;
> > > > > +CASE_CONVERT:
> > > > > +  return can_chrec_apply (TREE_OPERAND (chrec, 0));
> > > > > +default:;
> > > > > +  return tree_does_not_contain_chrecs (chrec);
> > > > > +}
> > > > > +}
> > > > > +
> > > > >  /* Return true when DEF can be analyzed in REGION by the scalar
> > > > > evolution analyzer.  */
> > > > >
> > > > > @@ -449,6 +470,7 @@ scev_analyzable_p (tree def, sese_l 
> > > > > || !defined_in_sese_p (scev, region))
> > > > >  && (tree_does_not_contain_chrecs (scev)
> > > > > || evolution_function_is_affine_p (scev))
> > > > >
> > > >
> > > > Why isn't evolution_function_is_affine_p returning false on {0, +,
{1, +,
> > > > 1}_1}_1?
> > > > This is quadratic.
> > >
> > > It returns false on that but the CHREC we ask it on is
> > >
> > > {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
> > >
> > > only the initial value is "quadratic".
> > >
> >
> > Right.
> > If I understand correctly, the scop is the body of loop_1,
> > and we do not need to represent the quadratic evolution
> > of the initial value.
>
> Giving the following full testing now.

And the following is what I have applied after bootstrap / testing
on x86_64-unknown-linux-gnu.

As you can see I needed some adjustments to not reject otherwise
valid SCEVs with address constants.

Richard.

2017-10-09  Richard Biener  

PR tree-optimization/82449
* sese.c (scev_analyzable_p): Check whether the SCEV is linear.
* tree-chrec.h (evolution_function_is_constant_p): Adjust to
allow constant addresses.
* tree-chrec.c (scev_is_linear_expression): Constant evolutions
are linear.

* gfortran.dg/graphite/pr82449.f: New testcase.


Looks good to me.

Thanks.


Index: gcc/sese.c
===
--- gcc/sese.c  (revision 253486)
+++ gcc/sese.c  (working copy)
@@ -444,14 +444,13 @@ scev_analyzable_p (tree def, sese_l 
   loop = loop_containing_stmt (SSA_NAME_DEF_STMT (def));
   scev = scalar_evolution_in_region (region, loop, def);

-  return !chrec_contains_undetermined (scev)
-&& (TREE_CODE (scev) != SSA_NAME
-   || !defined_in_sese_p (scev, region))
-&& (tree_does_not_contain_chrecs (scev)
-   || evolution_function_is_affine_p (scev))
-&& (! loop
-   || ! loop_in_sese_p (loop, region)
-   || ! 

Re: [PATCH][GRAPHITE] Fix PR82449

2017-10-09 Thread Richard Biener
On Mon, 9 Oct 2017, Richard Biener wrote:

> On Fri, 6 Oct 2017, Sebastian Pop wrote:
> 
> > On Fri, Oct 6, 2017 at 8:33 AM, Richard Biener  wrote:
> > 
> > > On Fri, 6 Oct 2017, Sebastian Pop wrote:
> > >
> > > > On Fri, Oct 6, 2017 at 6:56 AM, Richard Biener 
> > > wrote:
> > > >
> > > > >
> > > > > The following fences off a few more SCEVs through scev_analyzable_p
> > > given
> > > > > at the end we need those pass chrec_apply when getting a rename 
> > > > > through
> > > > > SCEV.
> > > > >
> > > > > The SCEV in question is
> > > > >
> > > > >   {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
> > > > >
> > > > > which fails to chrec_apply in the CHREC_LEFT part because that part
> > > > > is not affine (and we're usually not replacing a IV with a constant
> > > > > where chrec_apply might handle one or the other case).
> > > > >
> > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > > > >
> > > > > This fixes three out of the remaining 8 codegen errors in SPEC CPU
> > > 2006.
> > > > >
> > > > > Ok?
> > > > >
> > > > > Thanks,
> > > > > Richard.
> > > > >
> > > > > 2017-10-06  Richard Biener  
> > > > >
> > > > > PR tree-optimization/82449
> > > > > * sese.c (can_chrec_apply): New function.
> > > > > (scev_analyzable_p): Check we can call chrec_apply on the 
> > > > > SCEV.
> > > > >
> > > > > * gfortran.dg/graphite/pr82449.f: New testcase.
> > >
> > > >
> > > > > Index: gcc/sese.c
> > > > > ===
> > > > > --- gcc/sese.c  (revision 253477)
> > > > > +++ gcc/sese.c  (working copy)
> > > > > @@ -421,6 +421,27 @@ invariant_in_sese_p_rec (tree t, const s
> > > > >return true;
> > > > >  }
> > > > >
> > > > > +/* Check whether we can call chrec_apply on CHREC with arbitrary X 
> > > > > and
> > > > > VAR.  */
> > >
> > > > +
> > > > > +static bool
> > > > > +can_chrec_apply (tree chrec)
> > >
> > 
> > Could we use scev_is_linear_expression ?
> > It seems like can_chrec_apply has the same semantics.
> 
> Looks like that works.
> 
> > 
> > > > > +{
> > > > > +  if (automatically_generated_chrec_p (chrec))
> > > > > +return false;
> > > > > +  switch (TREE_CODE (chrec))
> > > > > +{
> > > > > +case POLYNOMIAL_CHREC:
> > > > > +  if (evolution_function_is_affine_p (chrec))
> > > > > +   return (can_chrec_apply (CHREC_LEFT (chrec))
> > > > > +   && can_chrec_apply (CHREC_RIGHT (chrec)));
> > > > > +  return false;
> > > > > +CASE_CONVERT:
> > > > > +  return can_chrec_apply (TREE_OPERAND (chrec, 0));
> > > > > +default:;
> > > > > +  return tree_does_not_contain_chrecs (chrec);
> > > > > +}
> > > > > +}
> > > > > +
> > > > >  /* Return true when DEF can be analyzed in REGION by the scalar
> > > > > evolution analyzer.  */
> > > > >
> > > > > @@ -449,6 +470,7 @@ scev_analyzable_p (tree def, sese_l 
> > > > > || !defined_in_sese_p (scev, region))
> > > > >  && (tree_does_not_contain_chrecs (scev)
> > > > > || evolution_function_is_affine_p (scev))
> > > > >
> > > >
> > > > Why isn't evolution_function_is_affine_p returning false on {0, +, {1, 
> > > > +,
> > > > 1}_1}_1?
> > > > This is quadratic.
> > >
> > > It returns false on that but the CHREC we ask it on is
> > >
> > > {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
> > >
> > > only the initial value is "quadratic".
> > >
> > 
> > Right.
> > If I understand correctly, the scop is the body of loop_1,
> > and we do not need to represent the quadratic evolution
> > of the initial value.
> 
> Giving the following full testing now.

And the following is what I have applied after bootstrap / testing
on x86_64-unknown-linux-gnu.

As you can see I needed some adjustments to not reject otherwise
valid SCEVs with address constants.

Richard.

2017-10-09  Richard Biener  

PR tree-optimization/82449
* sese.c (scev_analyzable_p): Check whether the SCEV is linear.
* tree-chrec.h (evolution_function_is_constant_p): Adjust to
allow constant addresses.
* tree-chrec.c (scev_is_linear_expression): Constant evolutions
are linear.

* gfortran.dg/graphite/pr82449.f: New testcase.

Index: gcc/sese.c
===
--- gcc/sese.c  (revision 253486)
+++ gcc/sese.c  (working copy)
@@ -444,14 +444,13 @@ scev_analyzable_p (tree def, sese_l 
   loop = loop_containing_stmt (SSA_NAME_DEF_STMT (def));
   scev = scalar_evolution_in_region (region, loop, def);
 
-  return !chrec_contains_undetermined (scev)
-&& (TREE_CODE (scev) != SSA_NAME
-   || !defined_in_sese_p (scev, region))
-&& (tree_does_not_contain_chrecs (scev)
-   || evolution_function_is_affine_p (scev))
-&& (! loop
-   || ! loop_in_sese_p (loop, region)
-   || ! chrec_contains_symbols_defined_in_loop (scev, loop->num));
+  

Re: [PATCH][GRAPHITE] Fix PR82449

2017-10-09 Thread Richard Biener
On Fri, 6 Oct 2017, Sebastian Pop wrote:

> On Fri, Oct 6, 2017 at 8:33 AM, Richard Biener  wrote:
> 
> > On Fri, 6 Oct 2017, Sebastian Pop wrote:
> >
> > > On Fri, Oct 6, 2017 at 6:56 AM, Richard Biener 
> > wrote:
> > >
> > > >
> > > > The following fences off a few more SCEVs through scev_analyzable_p
> > given
> > > > at the end we need those pass chrec_apply when getting a rename through
> > > > SCEV.
> > > >
> > > > The SCEV in question is
> > > >
> > > >   {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
> > > >
> > > > which fails to chrec_apply in the CHREC_LEFT part because that part
> > > > is not affine (and we're usually not replacing a IV with a constant
> > > > where chrec_apply might handle one or the other case).
> > > >
> > > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > > >
> > > > This fixes three out of the remaining 8 codegen errors in SPEC CPU
> > 2006.
> > > >
> > > > Ok?
> > > >
> > > > Thanks,
> > > > Richard.
> > > >
> > > > 2017-10-06  Richard Biener  
> > > >
> > > > PR tree-optimization/82449
> > > > * sese.c (can_chrec_apply): New function.
> > > > (scev_analyzable_p): Check we can call chrec_apply on the SCEV.
> > > >
> > > > * gfortran.dg/graphite/pr82449.f: New testcase.
> >
> > >
> > > > Index: gcc/sese.c
> > > > ===
> > > > --- gcc/sese.c  (revision 253477)
> > > > +++ gcc/sese.c  (working copy)
> > > > @@ -421,6 +421,27 @@ invariant_in_sese_p_rec (tree t, const s
> > > >return true;
> > > >  }
> > > >
> > > > +/* Check whether we can call chrec_apply on CHREC with arbitrary X and
> > > > VAR.  */
> >
> > > +
> > > > +static bool
> > > > +can_chrec_apply (tree chrec)
> >
> 
> Could we use scev_is_linear_expression ?
> It seems like can_chrec_apply has the same semantics.

Looks like that works.

> 
> > > > +{
> > > > +  if (automatically_generated_chrec_p (chrec))
> > > > +return false;
> > > > +  switch (TREE_CODE (chrec))
> > > > +{
> > > > +case POLYNOMIAL_CHREC:
> > > > +  if (evolution_function_is_affine_p (chrec))
> > > > +   return (can_chrec_apply (CHREC_LEFT (chrec))
> > > > +   && can_chrec_apply (CHREC_RIGHT (chrec)));
> > > > +  return false;
> > > > +CASE_CONVERT:
> > > > +  return can_chrec_apply (TREE_OPERAND (chrec, 0));
> > > > +default:;
> > > > +  return tree_does_not_contain_chrecs (chrec);
> > > > +}
> > > > +}
> > > > +
> > > >  /* Return true when DEF can be analyzed in REGION by the scalar
> > > > evolution analyzer.  */
> > > >
> > > > @@ -449,6 +470,7 @@ scev_analyzable_p (tree def, sese_l 
> > > > || !defined_in_sese_p (scev, region))
> > > >  && (tree_does_not_contain_chrecs (scev)
> > > > || evolution_function_is_affine_p (scev))
> > > >
> > >
> > > Why isn't evolution_function_is_affine_p returning false on {0, +, {1, +,
> > > 1}_1}_1?
> > > This is quadratic.
> >
> > It returns false on that but the CHREC we ask it on is
> >
> > {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
> >
> > only the initial value is "quadratic".
> >
> 
> Right.
> If I understand correctly, the scop is the body of loop_1,
> and we do not need to represent the quadratic evolution
> of the initial value.

Giving the following full testing now.

Richard.

2017-10-09  Richard Biener  

PR tree-optimization/82449
* sese.c (scev_analyzable_p): Check whether the SCEV is linear.

* gfortran.dg/graphite/pr82449.f: New testcase.

Index: gcc/sese.c
===
--- gcc/sese.c  (revision 253486)
+++ gcc/sese.c  (working copy)
@@ -444,14 +444,13 @@ scev_analyzable_p (tree def, sese_l 
   loop = loop_containing_stmt (SSA_NAME_DEF_STMT (def));
   scev = scalar_evolution_in_region (region, loop, def);
 
-  return !chrec_contains_undetermined (scev)
-&& (TREE_CODE (scev) != SSA_NAME
-   || !defined_in_sese_p (scev, region))
-&& (tree_does_not_contain_chrecs (scev)
-   || evolution_function_is_affine_p (scev))
-&& (! loop
-   || ! loop_in_sese_p (loop, region)
-   || ! chrec_contains_symbols_defined_in_loop (scev, loop->num));
+  return (!chrec_contains_undetermined (scev)
+ && (TREE_CODE (scev) != SSA_NAME
+ || !defined_in_sese_p (scev, region))
+ && scev_is_linear_expression (scev)
+ && (! loop
+ || ! loop_in_sese_p (loop, region)
+ || ! chrec_contains_symbols_defined_in_loop (scev, loop->num)));
 }
 
 /* Returns the scalar evolution of T in REGION.  Every variable that
Index: gcc/testsuite/gfortran.dg/graphite/pr82449.f
===
--- gcc/testsuite/gfortran.dg/graphite/pr82449.f(nonexistent)
+++ gcc/testsuite/gfortran.dg/graphite/pr82449.f(working copy)
@@ -0,0 +1,11 @@
+! { 

Re: [PATCH][GRAPHITE] Fix PR82449

2017-10-06 Thread Sebastian Pop
On Fri, Oct 6, 2017 at 8:33 AM, Richard Biener  wrote:

> On Fri, 6 Oct 2017, Sebastian Pop wrote:
>
> > On Fri, Oct 6, 2017 at 6:56 AM, Richard Biener 
> wrote:
> >
> > >
> > > The following fences off a few more SCEVs through scev_analyzable_p
> given
> > > at the end we need those pass chrec_apply when getting a rename through
> > > SCEV.
> > >
> > > The SCEV in question is
> > >
> > >   {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
> > >
> > > which fails to chrec_apply in the CHREC_LEFT part because that part
> > > is not affine (and we're usually not replacing a IV with a constant
> > > where chrec_apply might handle one or the other case).
> > >
> > > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > >
> > > This fixes three out of the remaining 8 codegen errors in SPEC CPU
> 2006.
> > >
> > > Ok?
> > >
> > > Thanks,
> > > Richard.
> > >
> > > 2017-10-06  Richard Biener  
> > >
> > > PR tree-optimization/82449
> > > * sese.c (can_chrec_apply): New function.
> > > (scev_analyzable_p): Check we can call chrec_apply on the SCEV.
> > >
> > > * gfortran.dg/graphite/pr82449.f: New testcase.
>
> >
> > > Index: gcc/sese.c
> > > ===
> > > --- gcc/sese.c  (revision 253477)
> > > +++ gcc/sese.c  (working copy)
> > > @@ -421,6 +421,27 @@ invariant_in_sese_p_rec (tree t, const s
> > >return true;
> > >  }
> > >
> > > +/* Check whether we can call chrec_apply on CHREC with arbitrary X and
> > > VAR.  */
>
> > +
> > > +static bool
> > > +can_chrec_apply (tree chrec)
>

Could we use scev_is_linear_expression ?
It seems like can_chrec_apply has the same semantics.


> > > +{
> > > +  if (automatically_generated_chrec_p (chrec))
> > > +return false;
> > > +  switch (TREE_CODE (chrec))
> > > +{
> > > +case POLYNOMIAL_CHREC:
> > > +  if (evolution_function_is_affine_p (chrec))
> > > +   return (can_chrec_apply (CHREC_LEFT (chrec))
> > > +   && can_chrec_apply (CHREC_RIGHT (chrec)));
> > > +  return false;
> > > +CASE_CONVERT:
> > > +  return can_chrec_apply (TREE_OPERAND (chrec, 0));
> > > +default:;
> > > +  return tree_does_not_contain_chrecs (chrec);
> > > +}
> > > +}
> > > +
> > >  /* Return true when DEF can be analyzed in REGION by the scalar
> > > evolution analyzer.  */
> > >
> > > @@ -449,6 +470,7 @@ scev_analyzable_p (tree def, sese_l 
> > > || !defined_in_sese_p (scev, region))
> > >  && (tree_does_not_contain_chrecs (scev)
> > > || evolution_function_is_affine_p (scev))
> > >
> >
> > Why isn't evolution_function_is_affine_p returning false on {0, +, {1, +,
> > 1}_1}_1?
> > This is quadratic.
>
> It returns false on that but the CHREC we ask it on is
>
> {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
>
> only the initial value is "quadratic".
>

Right.
If I understand correctly, the scop is the body of loop_1,
and we do not need to represent the quadratic evolution
of the initial value.


Re: [PATCH][GRAPHITE] Fix PR82449

2017-10-06 Thread Richard Biener
On Fri, 6 Oct 2017, Sebastian Pop wrote:

> On Fri, Oct 6, 2017 at 6:56 AM, Richard Biener  wrote:
> 
> >
> > The following fences off a few more SCEVs through scev_analyzable_p given
> > at the end we need those pass chrec_apply when getting a rename through
> > SCEV.
> >
> > The SCEV in question is
> >
> >   {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
> >
> > which fails to chrec_apply in the CHREC_LEFT part because that part
> > is not affine (and we're usually not replacing a IV with a constant
> > where chrec_apply might handle one or the other case).
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> >
> > This fixes three out of the remaining 8 codegen errors in SPEC CPU 2006.
> >
> > Ok?
> >
> > Thanks,
> > Richard.
> >
> > 2017-10-06  Richard Biener  
> >
> > PR tree-optimization/82449
> > * sese.c (can_chrec_apply): New function.
> > (scev_analyzable_p): Check we can call chrec_apply on the SCEV.
> >
> > * gfortran.dg/graphite/pr82449.f: New testcase.
> >
> > Index: gcc/sese.c
> > ===
> > --- gcc/sese.c  (revision 253477)
> > +++ gcc/sese.c  (working copy)
> > @@ -421,6 +421,27 @@ invariant_in_sese_p_rec (tree t, const s
> >return true;
> >  }
> >
> > +/* Check whether we can call chrec_apply on CHREC with arbitrary X and
> > VAR.  */
> > +
> > +static bool
> > +can_chrec_apply (tree chrec)
> > +{
> > +  if (automatically_generated_chrec_p (chrec))
> > +return false;
> > +  switch (TREE_CODE (chrec))
> > +{
> > +case POLYNOMIAL_CHREC:
> > +  if (evolution_function_is_affine_p (chrec))
> > +   return (can_chrec_apply (CHREC_LEFT (chrec))
> > +   && can_chrec_apply (CHREC_RIGHT (chrec)));
> > +  return false;
> > +CASE_CONVERT:
> > +  return can_chrec_apply (TREE_OPERAND (chrec, 0));
> > +default:;
> > +  return tree_does_not_contain_chrecs (chrec);
> > +}
> > +}
> > +
> >  /* Return true when DEF can be analyzed in REGION by the scalar
> > evolution analyzer.  */
> >
> > @@ -449,6 +470,7 @@ scev_analyzable_p (tree def, sese_l 
> > || !defined_in_sese_p (scev, region))
> >  && (tree_does_not_contain_chrecs (scev)
> > || evolution_function_is_affine_p (scev))
> >
> 
> Why isn't evolution_function_is_affine_p returning false on {0, +, {1, +,
> 1}_1}_1?
> This is quadratic.

It returns false on that but the CHREC we ask it on is

{(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2

only the initial value is "quadratic".

Richard.


Re: [PATCH][GRAPHITE] Fix PR82449

2017-10-06 Thread Sebastian Pop
On Fri, Oct 6, 2017 at 6:56 AM, Richard Biener  wrote:

>
> The following fences off a few more SCEVs through scev_analyzable_p given
> at the end we need those pass chrec_apply when getting a rename through
> SCEV.
>
> The SCEV in question is
>
>   {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2
>
> which fails to chrec_apply in the CHREC_LEFT part because that part
> is not affine (and we're usually not replacing a IV with a constant
> where chrec_apply might handle one or the other case).
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> This fixes three out of the remaining 8 codegen errors in SPEC CPU 2006.
>
> Ok?
>
> Thanks,
> Richard.
>
> 2017-10-06  Richard Biener  
>
> PR tree-optimization/82449
> * sese.c (can_chrec_apply): New function.
> (scev_analyzable_p): Check we can call chrec_apply on the SCEV.
>
> * gfortran.dg/graphite/pr82449.f: New testcase.
>
> Index: gcc/sese.c
> ===
> --- gcc/sese.c  (revision 253477)
> +++ gcc/sese.c  (working copy)
> @@ -421,6 +421,27 @@ invariant_in_sese_p_rec (tree t, const s
>return true;
>  }
>
> +/* Check whether we can call chrec_apply on CHREC with arbitrary X and
> VAR.  */
> +
> +static bool
> +can_chrec_apply (tree chrec)
> +{
> +  if (automatically_generated_chrec_p (chrec))
> +return false;
> +  switch (TREE_CODE (chrec))
> +{
> +case POLYNOMIAL_CHREC:
> +  if (evolution_function_is_affine_p (chrec))
> +   return (can_chrec_apply (CHREC_LEFT (chrec))
> +   && can_chrec_apply (CHREC_RIGHT (chrec)));
> +  return false;
> +CASE_CONVERT:
> +  return can_chrec_apply (TREE_OPERAND (chrec, 0));
> +default:;
> +  return tree_does_not_contain_chrecs (chrec);
> +}
> +}
> +
>  /* Return true when DEF can be analyzed in REGION by the scalar
> evolution analyzer.  */
>
> @@ -449,6 +470,7 @@ scev_analyzable_p (tree def, sese_l 
> || !defined_in_sese_p (scev, region))
>  && (tree_does_not_contain_chrecs (scev)
> || evolution_function_is_affine_p (scev))
>

Why isn't evolution_function_is_affine_p returning false on {0, +, {1, +,
1}_1}_1?
This is quadratic.



> +&& can_chrec_apply (scev)
>  && (! loop
> || ! loop_in_sese_p (loop, region)
> || ! chrec_contains_symbols_defined_in_loop (scev, loop->num));
> Index: gcc/testsuite/gfortran.dg/graphite/pr82449.f
> ===
> --- gcc/testsuite/gfortran.dg/graphite/pr82449.f(nonexistent)
> +++ gcc/testsuite/gfortran.dg/graphite/pr82449.f(working copy)
> @@ -0,0 +1,11 @@
> +! { dg-do compile }
> +! { dg-options "-O2 -floop-nest-optimize" }
> +
> +  SUBROUTINE JDFIDX(MKL,KGSH)
> +  DIMENSION MKL(6,6)
> +  NKL=0
> +  400 DO 40 KG = 1,KGSH
> +  DO 40 LG = 1,KG
> +  NKL = NKL + 1
> +   40 MKL(LG,KG) = NKL
> +  END
>


Re: [PATCH] [graphite] translate reads and writes in a single traversal of memory ops

2017-10-06 Thread Sebastian Pop
On Fri, Oct 6, 2017 at 6:27 AM, Richard Biener 
wrote:
>
> > Richard, could you please commit this patch, as I will need to figure out
> > why my
> > ssh keys don't let me to commit the code.  I will probably need to update
> > the key.
>
> Done.  You probably still have a v1 key which were rejected after some
> point.
> I would guess you'll need to contact overseers to replace your key.
>

Thanks!


[PATCH][GRAPHITE] Fix PR82449

2017-10-06 Thread Richard Biener

The following fences off a few more SCEVs through scev_analyzable_p, given
that at the end we need those to pass chrec_apply when getting a rename
through SCEV.

The SCEV in question is

  {(integer(kind=4)) {0, +, {1, +, 1}_1}_1, + 1}_2

which fails to chrec_apply in the CHREC_LEFT part because that part
is not affine (and we're usually not replacing an IV with a constant
where chrec_apply might handle one or the other case).

Bootstrapped and tested on x86_64-unknown-linux-gnu.

This fixes three out of the remaining 8 codegen errors in SPEC CPU 2006.

Ok?

Thanks,
Richard.

2017-10-06  Richard Biener  

PR tree-optimization/82449
* sese.c (can_chrec_apply): New function.
(scev_analyzable_p): Check we can call chrec_apply on the SCEV.

* gfortran.dg/graphite/pr82449.f: New testcase.

Index: gcc/sese.c
===
--- gcc/sese.c  (revision 253477)
+++ gcc/sese.c  (working copy)
@@ -421,6 +421,27 @@ invariant_in_sese_p_rec (tree t, const s
   return true;
 }
 
+/* Check whether we can call chrec_apply on CHREC with arbitrary X and VAR.  */
+
+static bool
+can_chrec_apply (tree chrec)
+{
+  if (automatically_generated_chrec_p (chrec))
+return false;
+  switch (TREE_CODE (chrec))
+{
+case POLYNOMIAL_CHREC:
+  if (evolution_function_is_affine_p (chrec))
+   return (can_chrec_apply (CHREC_LEFT (chrec))
+   && can_chrec_apply (CHREC_RIGHT (chrec)));
+  return false;
+CASE_CONVERT:
+  return can_chrec_apply (TREE_OPERAND (chrec, 0));
+default:;
+  return tree_does_not_contain_chrecs (chrec);
+}
+}
+
 /* Return true when DEF can be analyzed in REGION by the scalar
evolution analyzer.  */
 
@@ -449,6 +470,7 @@ scev_analyzable_p (tree def, sese_l 
|| !defined_in_sese_p (scev, region))
 && (tree_does_not_contain_chrecs (scev)
|| evolution_function_is_affine_p (scev))
+&& can_chrec_apply (scev)
 && (! loop
|| ! loop_in_sese_p (loop, region)
|| ! chrec_contains_symbols_defined_in_loop (scev, loop->num));
Index: gcc/testsuite/gfortran.dg/graphite/pr82449.f
===
--- gcc/testsuite/gfortran.dg/graphite/pr82449.f(nonexistent)
+++ gcc/testsuite/gfortran.dg/graphite/pr82449.f(working copy)
@@ -0,0 +1,11 @@
+! { dg-do compile }
+! { dg-options "-O2 -floop-nest-optimize" }
+
+  SUBROUTINE JDFIDX(MKL,KGSH)
+  DIMENSION MKL(6,6)
+  NKL=0
+  400 DO 40 KG = 1,KGSH
+  DO 40 LG = 1,KG
+  NKL = NKL + 1
+   40 MKL(LG,KG) = NKL
+  END


Re: [PATCH] [graphite] translate reads and writes in a single traversal of memory ops

2017-10-06 Thread Richard Biener
On Thu, Oct 5, 2017 at 4:27 PM, Sebastian Pop  wrote:
>
>
> On Mon, Oct 2, 2017 at 4:18 AM, Richard Biener 
> wrote:
>>
>> On Mon, Oct 2, 2017 at 6:53 AM, Sebastian Pop 
>> wrote:
>> > The patch moves the code that translates reads and writes to isl
>> > representation
>> > in a same loop.  This is to avoid traversing the scop blocks and arrays
>> > with
>> > memory operations 3 times.
>>
>> LGTM.
>
>
> Richard, could you please commit this patch, as I will need to figure out
> why my
> ssh keys don't let me commit the code.  I will probably need to update
> the key.

Done.  You probably still have a v1 key; those were rejected after some point.
I would guess you'll need to contact overseers to replace your key.

Richard.

> Thanks,
> Sebastian
>


Re: [PATCH][GRAPHITE] Rewrite PHI handling in code-gen

2017-10-05 Thread Sebastian Pop
On Thu, Oct 5, 2017 at 9:20 AM, Sebastian Pop  wrote:
>
> We also need to tag commutative and associative reductions
> in the dependence graph.  Now that the code generation will
> nicely handle scalar dependences, we may want to add back
> some of the code from this commit:
> https://gcc.gnu.org/viewcvs/gcc?view=revision=228255
>
>
The above patch is for tagging the assoc/comm reductions.
Here is the patch that removes reduction dependences
that can be ignored to compute a new schedule:
https://gcc.gnu.org/viewcvs/gcc?view=revision=228530


Re: [PATCH] [graphite] translate reads and writes in a single traversal of memory ops

2017-10-05 Thread Sebastian Pop
On Mon, Oct 2, 2017 at 4:18 AM, Richard Biener 
wrote:

> On Mon, Oct 2, 2017 at 6:53 AM, Sebastian Pop 
> wrote:
> > The patch moves the code that translates reads and writes to isl
> representation
> > in a same loop.  This is to avoid traversing the scop blocks and arrays
> with
> > memory operations 3 times.
>
> LGTM.
>

Richard, could you please commit this patch, as I will need to figure out
why my
ssh keys don't let me commit the code.  I will probably need to update
the key.

Thanks,
Sebastian


Re: [PATCH][GRAPHITE] Adjust CASE_CONVERT in extract_affine

2017-10-05 Thread Sebastian Pop
On Wed, Oct 4, 2017 at 2:45 AM, Richard Biener  wrote:

>
> While my last change involving signed types was correct it wasn't optimal.
> We can avoid the modulo constraints if the conversion is widening
> (thus all values fit in the new type).
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?


> Thanks,
> Richard.
>
> 2017-10-04  Richard Biener  
>
> * graphite-sese-to-poly.c (extract_affine): For casts increasing
> precision do not perform modulo reduction.
>

Looks good.
Thanks!


Re: [PATCH][GRAPHITE] Rewrite PHI handling in code-gen

2017-10-05 Thread Sebastian Pop
On Thu, Oct 5, 2017 at 6:43 AM, Richard Biener  wrote:

> On Wed, 4 Oct 2017, Richard Biener wrote:
>
> >
> > The following patch completely re-does PHI handling during
> > code-generation.  PHI handling is currently responsible for 99% of
> > all code-generation issues.  With the patch the number of code-generation
> > issues in SPEC 2k6 decreases from 180 to 5, similar adjustments happen
> > to the testsuite - only gfortran.dg/graphite has some expected code-gen
> > issues left.
>
> So I messed up the testsuite update and it turns out all code-gen
> issues are fixed in all graphite.exp testsuites.  Yay.
>
> > The current idea of categorizing PHIs and doing code-gen based on
> > pattern matching with the original GIMPLE IL isn't feasible given
> > ISL can do transforms like peeling, optimizing away conditions and
> > creating arbitrary number of GBB copies.  The current code fences
> > off a lot of cases by simply giving up.
> >
> > To fix the current code one would need to basically replicate the
> > update-SSA machinery we already have (and pointlessly exercise
> > from the graphite code-gen at the moment).
> >
> > Thus the patch rips out all manual handling of PHIs during
> code-generation
> > and leaves all cross-BB scalar updates to update-SSA.
> >
> > This means "going out-of-SSA" again, but instead of applying out-of-SSA
> > on the original GIMPLE IL I'm just doing this on-the-fly during
> > scalar dependence generation and code generation.
>

Sounds good!


> >
> >  bb3:
> >   goto bb5;
> >
> >  bb4:
> >
> >  bb5:
> >   _2 = PHI <_3(3), _4(4)>
> >
> > becomes (for an identity rewrite) before update-SSA:
> >
> >  bb3':
> >   D.1234 = _3;
> >
> >  bb4':
> >   D.1234 = _4;
> >
> >  bb5':
> >   _5 = D.1234;
> >
> > with _5 being a new def for _2.  update-SSA then re-writes the
> > _3 and _4 uses with available new defs we have registered during
> > code generation of the _3 and _4 def copies and rewrites D.1234
> > into SSA, inserting PHIs where necessary.
> >
> > This scheme of course relies on ISL outputting a correct schedule
> > which of course relies on us setting proper dependence constraints.
> > I've fixed quite a few issues there, for example missing constraints
> > for the SESE liveout variables.
> >
> > One awkward thing is that to not confuse ISL with PHI edge copies
> > placed in latch blocks, like
> >
> >   for (int c0 = 0; c0 < P_22; c0 += 1) {
> > S_6(0, c0);
> > if (P_22 >= c0 + 2)
> >   S_7(0, c0);
> >   }
> >
> > and ISL then happily peeling off the last iteration where the latch S_7
> > containing only the out-of-SSA copy is not needed.  So I'm trying to
> > detect empty latches and instead insert the out-of-SSA copy in its
> > predecessor instead (I know it doesn't matter if we execute the stmt
> > in the last iteration).
> >
> > The patch as-is ends up with quite some useless SSA copies which
> > is cleaned up by the copyprop pass inside the graphite pipeline
> > but can also be improved by teaching the into-SSA rewrite to
> > eliminate copies.
> >
> > I do expect issues with the patch (I'm seeing CE 416.gamess, but not
> > sure why and 403.gcc miscompare), but it's already become somewhat too
> big
> > to handle.
>
> The CE was on purpose, std=legacy brings it back, now it seems to
> miscompare.
>
> > Currently re-bootstrapping and testing after some cosmetic changes,
> > testsuites ran successfully, SPEC CPU 2006 built and run (with test
> data).
> > Statistics (with all graphite params set to unlimited) are
> >
> > loop nest optimized: 119
> > loop nest not optimized, code generation error: 5
> > loop nest not optimized, optimized schedule is identical to original
> > schedule: 110
> > loop nest not optimized, optimization timed out: 31
> > loop nest not optimized, ISL signalled an error: 6
> > loop nest: 271
> >
> > Ok for trunk?
>
> Thus the following updated patch (only the testsuite part changed).
>
> Bootstrapped with -fgraphite-identity -floop-nest-optimize and tested
> on x86_64-unknown-linux-gnu.
>
> Ok?
>

Yes, the patch looks good.  Thanks!


>
> There are two parts worth working on after this - one is creating
> a versioning condition to fend off alias-set compute fails, the
> other one is working on the proximity constraints.
>

Right.
We also need to tag commutative and associative reductions
in the dependence graph.  Now that the code generation will
nicely handle scalar dependences, we may want to add back
some of the code from this commit:
https://gcc.gnu.org/viewcvs/gcc?view=revision=228255


Re: [PATCH][GRAPHITE] Rewrite PHI handling in code-gen

2017-10-05 Thread Richard Biener
On Wed, 4 Oct 2017, Richard Biener wrote:

> 
> The following patch completely re-does PHI handling during 
> code-generation.  PHI handling is currently responsible for 99% of
> all code-generation issues.  With the patch the number of code-generation
> issues in SPEC 2k6 decreases from 180 to 5, similar adjustments happen
> to the testsuite - only gfortran.dg/graphite has some expected code-gen
> issues left.

So I messed up the testsuite update and it turns out all code-gen
issues are fixed in all graphite.exp testsuites.  Yay.

> The current idea of categorizing PHIs and doing code-gen based on
> pattern matching with the original GIMPLE IL isn't feasible given
> ISL can do transforms like peeling, optimizing away conditions and
> creating arbitrary number of GBB copies.  The current code fences
> off a lot of cases by simply giving up.
> 
> To fix the current code one would need to basically replicate the
> update-SSA machinery we already have (and pointlessly exercise
> from the graphite code-gen at the moment).
> 
> Thus the patch rips out all manual handling of PHIs during code-generation
> and leaves all cross-BB scalar updates to update-SSA.
> 
> This means "going out-of-SSA" again, but instead of applying out-of-SSA
> on the original GIMPLE IL I'm just doing this on-the-fly during
> scalar dependence generation and code generation.
> 
>  bb3:
>   goto bb5;
> 
>  bb4:
> 
>  bb5:
>   _2 = PHI <_3(3), _4(4)>
> 
> becomes (for an identity rewrite) before update-SSA:
> 
>  bb3':
>   D.1234 = _3;
> 
>  bb4':
>   D.1234 = _4;
> 
>  bb5':
>   _5 = D.1234;
> 
> with _5 being a new def for _2.  update-SSA then re-writes the
> _3 and _4 uses with available new defs we have registered during
> code generation of the _3 and _4 def copies and rewrites D.1234
> into SSA, inserting PHIs where necessary.
> 
> This scheme of course relies on ISL outputting a correct schedule
> which of course relies on us setting proper dependence constraints.
> I've fixed quite a few issues there, for example missing constraints
> for the SESE liveout variables.
> 
> One awkward thing is that to not confuse ISL with PHI edge copies
> placed in latch blocks, like
> 
>   for (int c0 = 0; c0 < P_22; c0 += 1) {
> S_6(0, c0);
> if (P_22 >= c0 + 2)
>   S_7(0, c0);
>   }
> 
> and ISL then happily peeling off the last iteration where the latch S_7
> containing only the out-of-SSA copy is not needed.  So I'm trying to
> detect empty latches and instead insert the out-of-SSA copy in its
> predecessor instead (I know it doesn't matter if we execute the stmt
> in the last iteration).
> 
> The patch as-is ends up with quite some useless SSA copies which
> is cleaned up by the copyprop pass inside the graphite pipeline
> but can also be improved by teaching the into-SSA rewrite to
> eliminate copies.
> 
> I do expect issues with the patch (I'm seeing CE 416.gamess, but not
> sure why and 403.gcc miscompare), but it's already become somewhat too big 
> to handle.

The CE was on purpose, std=legacy brings it back, now it seems to
miscompare.

> Currently re-bootstrapping and testing after some cosmetic changes,
> testsuites ran successfully, SPEC CPU 2006 built and run (with test data).
> Statistics (with all graphite params set to unlimited) are
> 
> loop nest optimized: 119
> loop nest not optimized, code generation error: 5
> loop nest not optimized, optimized schedule is identical to original 
> schedule: 110
> loop nest not optimized, optimization timed out: 31
> loop nest not optimized, ISL signalled an error: 6
> loop nest: 271
> 
> Ok for trunk?

Thus the following updated patch (only the testsuite part changed).

Bootstrapped with -fgraphite-identity -floop-nest-optimize and tested
on x86_64-unknown-linux-gnu.

Ok?

There are two parts worth working on after this - one is creating
a versioning condition to fend off alias-set compute fails, the
other one is working on the proximity constraints.

After this is in I'll extract testcases for the remaining
code-generation issues in SPEC.

Thanks,
Richard.

2017-10-05  Richard Biener  

* graphite-isl-ast-to-gimple.c: Include ssa.h and tree-ssa.h.
(translate_isl_ast_to_gimple::translate_pending_phi_nodes,
translate_isl_ast_to_gimple::is_valid_rename,
translate_isl_ast_to_gimple::get_rename,
translate_isl_ast_to_gimple::get_def_bb_for_const,
translate_isl_ast_to_gimple::get_new_name,
translate_isl_ast_to_gimple::collect_all_ssa_names,
translate_isl_ast_to_gimple::copy_loop_phi_args,
translate_isl_ast_to_gimple::collect_all_ssa_names,
translate_isl_ast_to_gimple::copy_loop_phi_args,
translate_isl_ast_to_gimple::copy_loop_phi_nodes,
translate_isl_ast_to_gimple::add_close_phis_to_merge_points,
translate_isl_ast_to_gimple::add_close_phis_to_outer_loops,
translate_isl_ast_to_gimple::copy_loop_close_phi_args,

[PATCH][GRAPHITE] Rewrite PHI handling in code-gen

2017-10-04 Thread Richard Biener

The following patch completely re-does PHI handling during 
code-generation.  PHI handling is currently responsible for 99% of
all code-generation issues.  With the patch the number of code-generation
issues in SPEC 2k6 decreases from 180 to 5, similar adjustments happen
to the testsuite - only gfortran.dg/graphite has some expected code-gen
issues left.

The current idea of categorizing PHIs and doing code-gen based on
pattern matching with the original GIMPLE IL isn't feasible given
ISL can do transforms like peeling, optimizing away conditions and
creating arbitrary number of GBB copies.  The current code fences
off a lot of cases by simply giving up.

To fix the current code one would need to basically replicate the
update-SSA machinery we already have (and pointlessly exercise
from the graphite code-gen at the moment).

Thus the patch rips out all manual handling of PHIs during code-generation
and leaves all cross-BB scalar updates to update-SSA.

This means "going out-of-SSA" again, but instead of applying out-of-SSA
on the original GIMPLE IL I'm just doing this on-the-fly during
scalar dependence generation and code generation.

 bb3:
  goto bb5;

 bb4:

 bb5:
  _2 = PHI <_3(3), _4(4)>

becomes (for an identity rewrite) before update-SSA:

 bb3':
  D.1234 = _3;

 bb4':
  D.1234 = _4;

 bb5':
  _5 = D.1234;

with _5 being a new def for _2.  update-SSA then re-writes the
_3 and _4 uses with available new defs we have registered during
code generation of the _3 and _4 def copies and rewrites D.1234
into SSA, inserting PHIs where necessary.

This scheme of course relies on ISL outputting a correct schedule
which of course relies on us setting proper dependence constraints.
I've fixed quite a few issues there, for example missing constraints
for the SESE liveout variables.

One awkward thing is that to not confuse ISL with PHI edge copies
placed in latch blocks, like

  for (int c0 = 0; c0 < P_22; c0 += 1) {
S_6(0, c0);
if (P_22 >= c0 + 2)
  S_7(0, c0);
  }

and ISL then happily peeling off the last iteration where the latch S_7
containing only the out-of-SSA copy is not needed.  So I'm trying to
detect empty latches and insert the out-of-SSA copy in its
predecessor instead (I know it doesn't matter if we execute the stmt
in the last iteration).

The patch as-is ends up with quite a few useless SSA copies, which
are cleaned up by the copyprop pass inside the graphite pipeline
but can also be reduced by teaching the into-SSA rewrite to
eliminate copies.

I do expect issues with the patch (I'm seeing CE 416.gamess, but not
sure why and 403.gcc miscompare), but it's already become somewhat too big 
to handle.

Currently re-bootstrapping and testing after some cosmetic changes,
testsuites ran successfully, SPEC CPU 2006 built and run (with test data).
Statistics (with all graphite params set to unlimited) are

loop nest optimized: 119
loop nest not optimized, code generation error: 5
loop nest not optimized, optimized schedule is identical to original 
schedule: 110
loop nest not optimized, optimization timed out: 31
loop nest not optimized, ISL signalled an error: 6
loop nest: 271

Ok for trunk?

Thanks,
Richard.

2017-10-04  Richard Biener  

* graphite-isl-ast-to-gimple.c: Include ssa.h and tree-ssa.h.
(translate_isl_ast_to_gimple::translate_pending_phi_nodes,
translate_isl_ast_to_gimple::is_valid_rename,
translate_isl_ast_to_gimple::get_rename,
translate_isl_ast_to_gimple::get_def_bb_for_const,
translate_isl_ast_to_gimple::get_new_name,
translate_isl_ast_to_gimple::collect_all_ssa_names,
translate_isl_ast_to_gimple::copy_loop_phi_args,
translate_isl_ast_to_gimple::collect_all_ssa_names,
translate_isl_ast_to_gimple::copy_loop_phi_args,
translate_isl_ast_to_gimple::copy_loop_phi_nodes,
translate_isl_ast_to_gimple::add_close_phis_to_merge_points,
translate_isl_ast_to_gimple::add_close_phis_to_outer_loops,
translate_isl_ast_to_gimple::copy_loop_close_phi_args,
translate_isl_ast_to_gimple::copy_loop_close_phi_nodes,
translate_isl_ast_to_gimple::copy_cond_phi_args,
translate_isl_ast_to_gimple::copy_cond_phi_nodes,
translate_isl_ast_to_gimple::edge_for_new_close_phis,
translate_isl_ast_to_gimple::add_phi_arg_for_new_expr,
translate_isl_ast_to_gimple::rename_uses,
translate_isl_ast_to_gimple::rename_all_uses): Remove.
(translate_isl_ast_to_gimple::get_rename_from_scev): Simplify.
(set_rename_for_each_def): Likewise.
(graphite_copy_stmts_from_block): Handle debug stmt resetting
here.  Handle rewriting SCEV analyzable uses here.
(copy_bb_and_scalar_dependences): Generate code for PHI
copy-in/outs.
(graphite_regenerate_ast_isl): Adjust.
* graphite-scop-detection.c (trivially_empty_bb_p): Move to sese.[ch].
(add_write, add_read): New 

[PATCH][GRAPHITE] Adjust CASE_CONVERT in extract_affine

2017-10-04 Thread Richard Biener

While my last change involving signed types was correct it wasn't optimal.
We can avoid the modulo constraints if the conversion is widening
(thus all values fit in the new type).

Bootstrapped and tested on x86_64-unknown-linux-gnu, ok?

Thanks,
Richard.

2017-10-04  Richard Biener  

* graphite-sese-to-poly.c (extract_affine): For casts increasing
precision do not perform modulo reduction.

Index: gcc/graphite-sese-to-poly.c
===
--- gcc/graphite-sese-to-poly.c (revision 253336)
+++ gcc/graphite-sese-to-poly.c (working copy)
@@ -299,11 +299,18 @@ extract_affine (scop_p s, tree e, __isl_
   return res;
 
 CASE_CONVERT:
-  res = extract_affine (s, TREE_OPERAND (e, 0), space);
-  /* signed values, even if overflow is undefined, get modulo-reduced.  */
-  if (! TYPE_UNSIGNED (type))
-   res = wrap (res, TYPE_PRECISION (type) - 1);
-  break;
+  {
+   tree itype = TREE_TYPE (TREE_OPERAND (e, 0));
+   res = extract_affine (s, TREE_OPERAND (e, 0), space);
+   /* Signed values, even if overflow is undefined, get modulo-reduced.
+  But only if not all values of the old type fit in the new.  */
+   if (! TYPE_UNSIGNED (type)
+   && ((TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (e, 0)))
+&& TYPE_PRECISION (type) <= TYPE_PRECISION (itype))
+   || TYPE_PRECISION (type) < TYPE_PRECISION (itype)))
+ res = wrap (res, TYPE_PRECISION (type) - 1);
+   break;
+  }
 
 case NON_LVALUE_EXPR:
   res = extract_affine (s, TREE_OPERAND (e, 0), space);



Re: [PATCH][GRAPHITE] Test for code generation errors

2017-10-04 Thread Richard Biener
On Tue, 3 Oct 2017, Rainer Orth wrote:

> Hi Richard,
> 
> > What ISL Versions are affected? 
> 
> it's 0.18.
> 
> >>Besides, there's
> >>
> >>+UNRESOLVED: gfortran.dg/graphite/pr42393-1.f90   -O  
> >>scan-tree-dump-times graphite "code generation error" 1
> >>
> >>for both 32 and 64-bit.  The log indicates
> >>
> >>gfortran.dg/graphite/pr42393-1.f90   -O  : dump file does not exist
> >>
> >>and indeed the test lacks -fdump-tree-graphite-details contrary to
> >>ChangeLog.
> >
> > Oops, I must have missed that one. Will fix tomorrow. 
> 
> Fine, thanks.

Fixed as follows, tested on x86_64-unknown-linux-gnu.

Richard.

2017-10-04  Richard Biener  

* gfortran.dg/graphite/id-17.f: For ilp32 allow graphite codegen
errors and scan for one.
* gfortran.dg/graphite/id-19.f: Likewise.
* gfortran.dg/graphite/pr29832.f90: Likewise.
* gfortran.dg/graphite/pr42326-1.f90: Likewise.
* gfortran.dg/graphite/pr42326.f90: Likewise.
* gfortran.dg/graphite/pr68550-2.f90: Likewise.
* gfortran.dg/graphite/run-id-2.f90: Likewise.
* gfortran.dg/graphite/run-id-3.f90: Likewise.
* gfortran.dg/graphite/pr42393-1.f90: Dump graphite.


Index: gcc/testsuite/gfortran.dg/graphite/id-17.f
===
--- gcc/testsuite/gfortran.dg/graphite/id-17.f  (revision 253393)
+++ gcc/testsuite/gfortran.dg/graphite/id-17.f  (working copy)
@@ -1,3 +1,4 @@
+! { dg-additional-options "-fdump-tree-graphite-details --param 
graphite-allow-codegen-errors=1" { target ilp32 } }
   SUBROUTINE SPECTOP(Dr,N)
   DIMENSION d1(0:32,0:32) , Dr(0:32,0:32) , x(0:32)
   DO k = 0 , N
@@ -14,3 +15,4 @@
  ENDDO
   ENDDO
   END
+! { dg-final { scan-tree-dump-times "code generation error" 1 " graphite" { 
target ilp32 } } }
Index: gcc/testsuite/gfortran.dg/graphite/id-19.f
===
--- gcc/testsuite/gfortran.dg/graphite/id-19.f  (revision 253393)
+++ gcc/testsuite/gfortran.dg/graphite/id-19.f  (working copy)
@@ -1,3 +1,4 @@
+! { dg-additional-options "-fdump-tree-graphite-details --param 
graphite-allow-codegen-errors=1" { target ilp32 } }
   SUBROUTINE ECCODR(FPQR)
   DIMENSION FPQR(25,25,25)
   INTEGER P,Q,R
@@ -13,3 +14,4 @@
   140QM2= QM2+TWO
   150 PM2= PM2+TWO
   END
+! { dg-final { scan-tree-dump-times "code generation error" 1 " graphite" { 
target ilp32 } } }
Index: gcc/testsuite/gfortran.dg/graphite/pr29832.f90
===
--- gcc/testsuite/gfortran.dg/graphite/pr29832.f90  (revision 253393)
+++ gcc/testsuite/gfortran.dg/graphite/pr29832.f90  (working copy)
@@ -1,5 +1,6 @@
 ! { dg-do run }
 ! { dg-options "-O2 -ftree-loop-linear" }
+! { dg-additional-options "-fdump-tree-graphite-details --param 
graphite-allow-codegen-errors=1" { target ilp32 } }
 
 ! Program to test the scalarizer
 program testarray
@@ -24,3 +25,4 @@ program testarray
end do
 end program
 
+! { dg-final { scan-tree-dump-times "code generation error" 1 " graphite" { 
target ilp32 } } }
Index: gcc/testsuite/gfortran.dg/graphite/pr42326-1.f90
===
--- gcc/testsuite/gfortran.dg/graphite/pr42326-1.f90(revision 253393)
+++ gcc/testsuite/gfortran.dg/graphite/pr42326-1.f90(working copy)
@@ -1,7 +1,7 @@
 ! { dg-do compile { target i?86-*-* x86_64-*-* } }
 ! { dg-require-effective-target ilp32 }
 ! { dg-require-effective-target sse2 }
-! { dg-options "-O2 -floop-parallelize-all -fprefetch-loop-arrays -msse2" }
+! { dg-options "-O2 -floop-parallelize-all -fprefetch-loop-arrays -msse2 
-fdump-tree-graphite-details --param graphite-allow-codegen-errors=1" }
 
 subroutine phasad(t,i,ium)
   implicit none
@@ -17,3 +17,4 @@ subroutine phasad(t,i,ium)
   return
 end subroutine phasad
 
+! { dg-final { scan-tree-dump-times "code generation error" 1 " graphite" } }
Index: gcc/testsuite/gfortran.dg/graphite/pr42326.f90
===
--- gcc/testsuite/gfortran.dg/graphite/pr42326.f90  (revision 253393)
+++ gcc/testsuite/gfortran.dg/graphite/pr42326.f90  (working copy)
@@ -1,7 +1,7 @@
 ! { dg-do compile { target i?86-*-* x86_64-*-* } }
 ! { dg-require-effective-target ilp32 }
 ! { dg-require-effective-target sse2 }
-! { dg-options "-O2 -floop-strip-mine -fprefetch-loop-arrays -msse2" }
+! { dg-options "-O2 -floop-strip-mine -fprefetch-loop-arrays -msse2 
-fdump-tree-graphite-details --param graphite-allow-codegen-errors=1" }
 
 subroutine blts ( ldmx, ldmy, v, tmp1, i, j, k)
   implicit none
@@ -34,3 +34,4 @@ subroutine phasad(t,i,ium)
   return
 end subroutine phasad
 
+! { dg-final { scan-tree-dump-times "code generation error" 1 " graphite" } }
Index: gcc/testsuite/gfortran.dg/graphite/pr42393-1.f90

Re: [PATCH][GRAPHITE] Test for code generation errors

2017-10-03 Thread Rainer Orth
Hi Richard,

> What ISL Versions are affected? 

it's 0.18.

>>Besides, there's
>>
>>+UNRESOLVED: gfortran.dg/graphite/pr42393-1.f90   -O  
>>scan-tree-dump-times graphite "code generation error" 1
>>
>>for both 32 and 64-bit.  The log indicates
>>
>>gfortran.dg/graphite/pr42393-1.f90   -O  : dump file does not exist
>>
>>and indeed the test lacks -fdump-tree-graphite-details contrary to
>>ChangeLog.
>
> Oops, I must have missed that one. Will fix tomorrow. 

Fine, thanks.

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


Re: [PATCH][GRAPHITE] Test for code generation errors

2017-10-03 Thread Richard Biener
On October 3, 2017 11:48:35 AM GMT+02:00, Rainer Orth 
 wrote:
>Hi Richard,
>
>> The following patch adjusts GRAPHITE testing to check that existing
>> code generation issues occur and makes code generation ICE with
>> -fchecking --param graphite-allow-codegen-errors=0.  The param
>> is really a testsuite artifact so we can have testcases with
>> issues where we have papered over GRAPHITE issues with aborting
>> code generation.
>>
>> This avoids regressing testcases that do not show code generation
>> issues and it allows detecting testcases that no longer show
>> code generation issues (so we can avoid regressing that feat later).
>>
>> I'm now working on code-generation issues so that's an important
>> feature for me.
>>
>> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to
>trunk.
>>
>> Richard.
>>
>> 2017-10-02  Richard Biener  
>>
>>  * graphite-isl-ast-to-gimple.c (set_codegen_error): With
>>  -fchecking and --param graphite-allow-codegen-errors=0 ICE.
>>  * params.def (PARAM_GRAPHITE_ALLOW_CODEGEN_ERRORS): New param.
>>
>>  * gcc.dg/graphite/graphite.exp: Add -fdump-tree-graphite-details.
>>  * gcc.dg/graphite/id-16.c: Adjust for existing codegen errors.
>>  * gcc.dg/graphite/pr46168.c: Likewise.
>>  * gcc.dg/graphite/pr68756.c: Likewise.
>>  * gcc.dg/graphite/pr69728.c: Likewise.
>>  * gcc.dg/graphite/pr71575-2.c: Likewise.
>>  * gcc.dg/graphite/pr77362.c: Likewise.
>>  * gcc.dg/graphite/pr81373.c: Likewise.
>>  * gcc.dg/graphite/run-id-pr67700-1.c: Likewise.
>>  * gfortran.dg/graphite/interchange-1.f: Likewise.
>>  * gfortran.dg/graphite/pr29581.f90: Likewise.
>>  * gfortran.dg/graphite/pr42334-1.f: Likewise.
>>  * gfortran.dg/graphite/pr42393-1.f90: Likewise.
>>  * gfortran.dg/graphite/pr42393.f90: Likewise.
>>  * gfortran.dg/graphite/pr47019.f: Likewise.
>
>the patch caused a couple of regressions, unfortunately.  E.g.
>
>+FAIL: gfortran.dg/graphite/id-17.f   -O  (internal compiler error)
>+FAIL: gfortran.dg/graphite/id-17.f   -O  (test for excess errors)
>
>Excess errors:
>during GIMPLE pass: graphite
>/vol/gcc/src/hg/trunk/local/gcc/testsuite/gfortran.dg/graphite/id-17.f:1:0:
>internal compiler error: in set_codegen_error, at
>graphite-isl-ast-to-gimple.c:248
>0x10d5ebb translate_isl_ast_to_gimple::set_codegen_error()
>   /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:247
>0x10d0023 translate_isl_ast_to_gimple::set_codegen_error()
>   /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:247
>0x10d0023 translate_isl_ast_to_gimple::get_rename_from_scev(tree_node*,
>gimple**, loop*, basic_block_def*, basic_block_def*, vecva_heap, vl_ptr>)
>  /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:1510
>0x10d2963 translate_isl_ast_to_gimple::rename_uses(gimple*,
>gimple_stmt_iterator*, basic_block_def*, loop*, vecva_heap, vl_ptr>)
>  /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:1644
>0x10d4217
>translate_isl_ast_to_gimple::graphite_copy_stmts_from_block(basic_block_def*,
>basic_block_def*, vec)
>  /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:2516
>0x10d4653
>translate_isl_ast_to_gimple::copy_bb_and_scalar_dependences(basic_block_def*,
>edge_def*, vec)
>  /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:2717
>0x10d4ecb
>translate_isl_ast_to_gimple::translate_isl_ast_node_user(isl_ast_node*,
>edge_def*, std::map,
>std::allocator > >&)
>   /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:835
>0x10d5147
>translate_isl_ast_to_gimple::translate_isl_ast_for_loop(loop*,
>isl_ast_node*, edge_def*, tree_node*, tree_node*, tree_node*,
>std::map,
>std::allocator > >&)
>   /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:649
>0x10d53ab
>translate_isl_ast_to_gimple::translate_isl_ast_node_for(loop*,
>isl_ast_node*, edge_def*, std::mapstd::less, std::allocator
>> >&)
>   /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:752
>0x10d5477
>translate_isl_ast_to_gimple::translate_isl_ast_node_block(loop*,
>isl_ast_node*, edge_def*, std::mapstd::less, std::allocator
>> >&)
>   /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:864
>0x10d5147
>translate_isl_ast_to_gimple::translate_isl_ast_for_loop(loop*,
>isl_ast_node*, edge_def*, tree_node*, tree_node*, tree_node*,
>std::map,
>std::allocator > >&)
>   /vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:649
>0x10d53ab

Re: [PATCH][GRAPHITE] Test for code generation errors

2017-10-03 Thread Rainer Orth
Hi Richard,

> The following patch adjusts GRAPHITE testing to check that existing
> code generation issues occur and makes code generation ICE with
> -fchecking --param graphite-allow-codegen-errors=0.  The param
> is really a testsuite artifact so we can have testcases with
> issues where we have papered over GRAPHITE issues with aborting
> code generation.
>
> This avoids regressing testcases that do not show code generation
> issues and it allows detecting testcases that no longer show
> code generation issues (so we can avoid regressing that feat later).
>
> I'm now working on code-generation issues so that's an important
> feature for me.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
>
> Richard.
>
> 2017-10-02  Richard Biener  
>
>   * graphite-isl-ast-to-gimple.c (set_codegen_error): With
>   -fchecking and --param graphite-allow-codegen-errors=0 ICE.
>   * params.def (PARAM_GRAPHITE_ALLOW_CODEGEN_ERRORS): New param.
>
>   * gcc.dg/graphite/graphite.exp: Add -fdump-tree-graphite-details.
>   * gcc.dg/graphite/id-16.c: Adjust for existing codegen errors.
>   * gcc.dg/graphite/pr46168.c: Likewise.
>   * gcc.dg/graphite/pr68756.c: Likewise.
>   * gcc.dg/graphite/pr69728.c: Likewise.
>   * gcc.dg/graphite/pr71575-2.c: Likewise.
>   * gcc.dg/graphite/pr77362.c: Likewise.
>   * gcc.dg/graphite/pr81373.c: Likewise.
>   * gcc.dg/graphite/run-id-pr67700-1.c: Likewise.
>   * gfortran.dg/graphite/interchange-1.f: Likewise.
>   * gfortran.dg/graphite/pr29581.f90: Likewise.
>   * gfortran.dg/graphite/pr42334-1.f: Likewise.
>   * gfortran.dg/graphite/pr42393-1.f90: Likewise.
>   * gfortran.dg/graphite/pr42393.f90: Likewise.
>   * gfortran.dg/graphite/pr47019.f: Likewise.

the patch caused a couple of regressions, unfortunately.  E.g.

+FAIL: gfortran.dg/graphite/id-17.f   -O  (internal compiler error)
+FAIL: gfortran.dg/graphite/id-17.f   -O  (test for excess errors)

Excess errors:
during GIMPLE pass: graphite
/vol/gcc/src/hg/trunk/local/gcc/testsuite/gfortran.dg/graphite/id-17.f:1:0: 
internal compiler error: in set_codegen_error, at 
graphite-isl-ast-to-gimple.c:248
0x10d5ebb translate_isl_ast_to_gimple::set_codegen_error()
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:247
0x10d0023 translate_isl_ast_to_gimple::set_codegen_error()
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:247
0x10d0023 translate_isl_ast_to_gimple::get_rename_from_scev(tree_node*, 
gimple**, loop*, basic_block_def*, basic_block_def*, vec)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:1510
0x10d2963 translate_isl_ast_to_gimple::rename_uses(gimple*, 
gimple_stmt_iterator*, basic_block_def*, loop*, vec)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:1644
0x10d4217 
translate_isl_ast_to_gimple::graphite_copy_stmts_from_block(basic_block_def*, 
basic_block_def*, vec)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:2516
0x10d4653 
translate_isl_ast_to_gimple::copy_bb_and_scalar_dependences(basic_block_def*, 
edge_def*, vec)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:2717
0x10d4ecb 
translate_isl_ast_to_gimple::translate_isl_ast_node_user(isl_ast_node*, 
edge_def*, std::map, 
std::allocator > >&)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:835
0x10d5147 translate_isl_ast_to_gimple::translate_isl_ast_for_loop(loop*, 
isl_ast_node*, edge_def*, tree_node*, tree_node*, tree_node*, std::map, std::allocator > >&)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:649
0x10d53ab translate_isl_ast_to_gimple::translate_isl_ast_node_for(loop*, 
isl_ast_node*, edge_def*, std::map, 
std::allocator > >&)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:752
0x10d5477 translate_isl_ast_to_gimple::translate_isl_ast_node_block(loop*, 
isl_ast_node*, edge_def*, std::map, 
std::allocator > >&)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:864
0x10d5147 translate_isl_ast_to_gimple::translate_isl_ast_for_loop(loop*, 
isl_ast_node*, edge_def*, tree_node*, tree_node*, tree_node*, std::map, std::allocator > >&)
/vol/gcc/src/hg/trunk/local/gcc/graphite-isl-ast-to-gimple.c:649
0x10d53ab translate_isl_ast_to_gimple::translate_isl_ast_node_for(loop*, 
isl_ast_node*, edge_def*, std::map, 

Re: [PATCH][GRAPHITE] Test for code generation errors

2017-10-02 Thread Sebastian Pop
On Mon, Oct 2, 2017 at 4:58 AM, Richard Biener  wrote:
>
> The following patch adjusts GRAPHITE testing to check that existing
> code generation issues occur and makes code generation ICE with
> -fchecking --param graphite-allow-codegen-errors=0.  The param
> is really a testsuite artifact so we can have testcases with
> issues where we have papered over GRAPHITE issues with aborting
> code generation.
>
> This avoids regressing testcases that do not show code generation
> issues and it allows detecting testcases that no longer show
> code generation issues (so we can avoid regressing that feat later).
>
> I'm now working on code-generation issues so that's an important
> feature for me.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
>
> Richard.
>
> 2017-10-02  Richard Biener  
>
> * graphite-isl-ast-to-gimple.c (set_codegen_error): With
> -fchecking and --param graphite-allow-codegen-errors=0 ICE.
> * params.def (PARAM_GRAPHITE_ALLOW_CODEGEN_ERRORS): New param.
>
> * gcc.dg/graphite/graphite.exp: Add -fdump-tree-graphite-details.
> * gcc.dg/graphite/id-16.c: Adjust for existing codegen errors.
> * gcc.dg/graphite/pr46168.c: Likewise.
> * gcc.dg/graphite/pr68756.c: Likewise.
> * gcc.dg/graphite/pr69728.c: Likewise.
> * gcc.dg/graphite/pr71575-2.c: Likewise.
> * gcc.dg/graphite/pr77362.c: Likewise.
> * gcc.dg/graphite/pr81373.c: Likewise.
> * gcc.dg/graphite/run-id-pr67700-1.c: Likewise.
> * gfortran.dg/graphite/interchange-1.f: Likewise.
> * gfortran.dg/graphite/pr29581.f90: Likewise.
> * gfortran.dg/graphite/pr42334-1.f: Likewise.
> * gfortran.dg/graphite/pr42393-1.f90: Likewise.
> * gfortran.dg/graphite/pr42393.f90: Likewise.
> * gfortran.dg/graphite/pr47019.f: Likewise.

Looks good.


[PATCH][GRAPHITE] Test for code generation errors

2017-10-02 Thread Richard Biener

The following patch adjusts GRAPHITE testing to check that existing
code generation issues occur and makes code generation ICE with
-fchecking --param graphite-allow-codegen-errors=0.  The param
is really a testsuite artifact so we can have testcases with
issues where we have papered over GRAPHITE issues with aborting
code generation.

This avoids regressing testcases that do not show code generation
issues and it allows detecting testcases that no longer show
code generation issues (so we can avoid regressing that feat later).

I'm now working on code-generation issues so that's an important
feature for me.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2017-10-02  Richard Biener  

* graphite-isl-ast-to-gimple.c (set_codegen_error): With
-fchecking and --param graphite-allow-codegen-errors=0 ICE.
* params.def (PARAM_GRAPHITE_ALLOW_CODEGEN_ERRORS): New param.

* gcc.dg/graphite/graphite.exp: Add -fdump-tree-graphite-details.
* gcc.dg/graphite/id-16.c: Adjust for existing codegen errors.
* gcc.dg/graphite/pr46168.c: Likewise.
* gcc.dg/graphite/pr68756.c: Likewise.
* gcc.dg/graphite/pr69728.c: Likewise.
* gcc.dg/graphite/pr71575-2.c: Likewise.
* gcc.dg/graphite/pr77362.c: Likewise.
* gcc.dg/graphite/pr81373.c: Likewise.
* gcc.dg/graphite/run-id-pr67700-1.c: Likewise.
* gfortran.dg/graphite/interchange-1.f: Likewise.
* gfortran.dg/graphite/pr29581.f90: Likewise.
* gfortran.dg/graphite/pr42334-1.f: Likewise.
* gfortran.dg/graphite/pr42393-1.f90: Likewise.
* gfortran.dg/graphite/pr42393.f90: Likewise.
* gfortran.dg/graphite/pr47019.f: Likewise.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253336)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -240,7 +240,14 @@ class translate_isl_ast_to_gimple
   void gsi_insert_earliest (gimple_seq seq);
   tree rename_all_uses (tree new_expr, basic_block new_bb, basic_block old_bb);
   bool codegen_error_p () const { return codegen_error; }
-  void set_codegen_error () { codegen_error = true; }
+
+  void set_codegen_error ()
+  {
+codegen_error = true;
+gcc_assert (! flag_checking
+   || PARAM_VALUE (PARAM_GRAPHITE_ALLOW_CODEGEN_ERRORS));
+  }
+
   bool is_constant (tree op) const
   {
 return TREE_CODE (op) == INTEGER_CST
Index: gcc/params.def
===
--- gcc/params.def  (revision 253336)
+++ gcc/params.def  (working copy)
@@ -894,6 +894,12 @@ DEFPARAM (PARAM_MAX_ISL_OPERATIONS,
  "maximum number of isl operations, 0 means unlimited",
  35, 0, 0)
 
+/* For testsuite purposes allow to check for codegen error handling.  */
+DEFPARAM (PARAM_GRAPHITE_ALLOW_CODEGEN_ERRORS,
+ "graphite-allow-codegen-errors",
+ "whether codegen errors should be ICEs when -fchecking.",
+ 0, 0, 1)
+
 /* Avoid data dependence analysis on very large loops.  */
 DEFPARAM (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS,
  "loop-max-datarefs-for-datadeps",
Index: gcc/testsuite/gcc.dg/graphite/graphite.exp
===
--- gcc/testsuite/gcc.dg/graphite/graphite.exp  (revision 253336)
+++ gcc/testsuite/gcc.dg/graphite/graphite.exp  (working copy)
@@ -57,11 +57,11 @@ set vect_files[lsort [glob -noco
 # Tests to be compiled.
 set dg-do-what-default compile
 dg-runtest $scop_files"" "-O2 -fgraphite -fdump-tree-graphite-all"
-dg-runtest $id_files  "" "-O2 -fgraphite-identity -ffast-math"
+dg-runtest $id_files  "" "-O2 -fgraphite-identity -ffast-math 
-fdump-tree-graphite-details"
 
 # Tests to be run.
 set dg-do-what-default run
-dg-runtest $run_id_files  "" "-O2 -fgraphite-identity"
+dg-runtest $run_id_files  "" "-O2 -fgraphite-identity 
-fdump-tree-graphite-details"
 dg-runtest $opt_files "" "-O2 -ffast-math -floop-nest-optimize 
-fdump-tree-graphite-all"
 
 # Vectorizer tests, to be run or compiled, depending on target capabilities.
Index: gcc/testsuite/gcc.dg/graphite/id-16.c
===
--- gcc/testsuite/gcc.dg/graphite/id-16.c   (revision 253336)
+++ gcc/testsuite/gcc.dg/graphite/id-16.c   (working copy)
@@ -1,3 +1,5 @@
+/* { dg-additional-options "--param graphite-allow-codegen-errors=1" } */
+
 int transformation[(2*19 - 1) * (2*19 - 1)][8];
 
 const int transformation2[8][2][2] = {
@@ -42,3 +44,5 @@ transformation_init (void)
}
 }
 }
+
+/* { dg-final { scan-tree-dump-times "code generation error" 1 "graphite" } } 
*/
Index: gcc/testsuite/gcc.dg/graphite/pr46168.c
===
--- gcc/testsuite/gcc.dg/graphite/pr46168.c

Re: [PATCH] [graphite] translate reads and writes in a single traversal of memory ops

2017-10-02 Thread Richard Biener
On Mon, Oct 2, 2017 at 6:53 AM, Sebastian Pop  wrote:
> The patch moves the code that translates reads and writes to isl 
> representation
> in the same loop.  This is to avoid traversing the scop blocks and arrays with
> memory operations 3 times.

LGTM.

Richard.

> * graphite-dependences.c (scop_get_reads): Move code to...
> (scop_get_must_writes): Move code to...
> (scop_get_may_writes): Move code to...
> (scop_get_reads_and_writes): ... here.
> (scop_get_dependences): Call scop_get_reads_and_writes.
> ---
>  gcc/graphite-dependences.c | 78 
> +-
>  1 file changed, 21 insertions(+), 57 deletions(-)
>
> diff --git a/gcc/graphite-dependences.c b/gcc/graphite-dependences.c
> index 4ed9d00..2066b2e 100644
> --- a/gcc/graphite-dependences.c
> +++ b/gcc/graphite-dependences.c
> @@ -63,20 +63,21 @@ add_pdr_constraints (poly_dr_p pdr, poly_bb_p pbb)
>return constrain_domain (x, isl_set_copy (pbb->domain));
>  }
>
> -/* Returns all the memory reads in SCOP.  */
> +/* Returns an isl description of all memory operations in SCOP.  The memory
> +   reads are returned in READS and writes in MUST_WRITES and MAY_WRITES.  */
>
> -static isl_union_map *
> -scop_get_reads (scop_p scop)
> +static void
> +scop_get_reads_and_writes (scop_p scop, isl_union_map *reads,
> +  isl_union_map *must_writes,
> +  isl_union_map *may_writes)
>  {
>int i, j;
>poly_bb_p pbb;
>poly_dr_p pdr;
> -  isl_space *space = isl_set_get_space (scop->param_context);
> -  isl_union_map *res = isl_union_map_empty (space);
>
>FOR_EACH_VEC_ELT (scop->pbbs, i, pbb)
>  {
> -  FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr)
> +  FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr) {
> if (pdr_read_p (pdr))
>   {
> if (dump_file)
> @@ -86,33 +87,14 @@ scop_get_reads (scop_p scop)
>   }
> isl_union_map *um
>   = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
> -   res = isl_union_map_union (res, um);
> +   reads = isl_union_map_union (reads, um);
> if (dump_file)
>   {
> fprintf (dump_file, "Reads depedence graph: ");
> -   print_isl_union_map (dump_file, res);
> +   print_isl_union_map (dump_file, reads);
>   }
>   }
> -}
> -
> -  return isl_union_map_coalesce (res);
> -}
> -
> -/* Returns all the memory must writes in SCOP.  */
> -
> -static isl_union_map *
> -scop_get_must_writes (scop_p scop)
> -{
> -  int i, j;
> -  poly_bb_p pbb;
> -  poly_dr_p pdr;
> -  isl_space *space = isl_set_get_space (scop->param_context);
> -  isl_union_map *res = isl_union_map_empty (space);
> -
> -  FOR_EACH_VEC_ELT (scop->pbbs, i, pbb)
> -{
> -  FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr)
> -   if (pdr_write_p (pdr))
> +   else if (pdr_write_p (pdr))
>   {
> if (dump_file)
>   {
> @@ -121,33 +103,14 @@ scop_get_must_writes (scop_p scop)
>   }
> isl_union_map *um
>   = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
> -   res = isl_union_map_union (res, um);
> +   must_writes = isl_union_map_union (must_writes, um);
> if (dump_file)
>   {
> fprintf (dump_file, "Must writes depedence graph: ");
> -   print_isl_union_map (dump_file, res);
> +   print_isl_union_map (dump_file, must_writes);
>   }
>   }
> -}
> -
> -  return isl_union_map_coalesce (res);
> -}
> -
> -/* Returns all the memory may writes in SCOP.  */
> -
> -static isl_union_map *
> -scop_get_may_writes (scop_p scop)
> -{
> -  int i, j;
> -  poly_bb_p pbb;
> -  poly_dr_p pdr;
> -  isl_space *space = isl_set_get_space (scop->param_context);
> -  isl_union_map *res = isl_union_map_empty (space);
> -
> -  FOR_EACH_VEC_ELT (scop->pbbs, i, pbb)
> -{
> -  FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr)
> -   if (pdr_may_write_p (pdr))
> +   else if (pdr_may_write_p (pdr))
>   {
> if (dump_file)
>   {
> @@ -156,16 +119,15 @@ scop_get_may_writes (scop_p scop)
>   }
> isl_union_map *um
>   = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
> -   res = isl_union_map_union (res, um);
> +   may_writes = isl_union_map_union (may_writes, um);
> if (dump_file)
>   {
> fprintf (dump_file, "May writes depedence graph: ");
> -   print_isl_union_map (dump_file, res);
> +   print_isl_union_map (dump_file, may_writes);
>   }
>   }
> +  }
>  }
> -
> -  return isl_union_map_coalesce (res);
>  }
>
>  /* Helper function used on each MAP of a isl_union_map.  Computes the
> @@ -300,9 +262,11 @@ scop_get_dependences (scop_p 

[PATCH] [graphite] translate reads and writes in a single traversal of memory ops

2017-10-01 Thread Sebastian Pop
The patch moves the code that translates reads and writes to isl representation
in the same loop.  This is to avoid traversing the scop blocks and arrays with
memory operations 3 times.

* graphite-dependences.c (scop_get_reads): Move code to...
(scop_get_must_writes): Move code to...
(scop_get_may_writes): Move code to...
(scop_get_reads_and_writes): ... here.
(scop_get_dependences): Call scop_get_reads_and_writes.
---
 gcc/graphite-dependences.c | 78 +-
 1 file changed, 21 insertions(+), 57 deletions(-)

diff --git a/gcc/graphite-dependences.c b/gcc/graphite-dependences.c
index 4ed9d00..2066b2e 100644
--- a/gcc/graphite-dependences.c
+++ b/gcc/graphite-dependences.c
@@ -63,20 +63,21 @@ add_pdr_constraints (poly_dr_p pdr, poly_bb_p pbb)
   return constrain_domain (x, isl_set_copy (pbb->domain));
 }
 
-/* Returns all the memory reads in SCOP.  */
+/* Returns an isl description of all memory operations in SCOP.  The memory
+   reads are returned in READS and writes in MUST_WRITES and MAY_WRITES.  */
 
-static isl_union_map *
-scop_get_reads (scop_p scop)
+static void
+scop_get_reads_and_writes (scop_p scop, isl_union_map *reads,
+  isl_union_map *must_writes,
+  isl_union_map *may_writes)
 {
   int i, j;
   poly_bb_p pbb;
   poly_dr_p pdr;
-  isl_space *space = isl_set_get_space (scop->param_context);
-  isl_union_map *res = isl_union_map_empty (space);
 
   FOR_EACH_VEC_ELT (scop->pbbs, i, pbb)
 {
-  FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr)
+  FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr) {
if (pdr_read_p (pdr))
  {
if (dump_file)
@@ -86,33 +87,14 @@ scop_get_reads (scop_p scop)
  }
isl_union_map *um
  = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
-   res = isl_union_map_union (res, um);
+   reads = isl_union_map_union (reads, um);
if (dump_file)
  {
fprintf (dump_file, "Reads depedence graph: ");
-   print_isl_union_map (dump_file, res);
+   print_isl_union_map (dump_file, reads);
  }
  }
-}
-
-  return isl_union_map_coalesce (res);
-}
-
-/* Returns all the memory must writes in SCOP.  */
-
-static isl_union_map *
-scop_get_must_writes (scop_p scop)
-{
-  int i, j;
-  poly_bb_p pbb;
-  poly_dr_p pdr;
-  isl_space *space = isl_set_get_space (scop->param_context);
-  isl_union_map *res = isl_union_map_empty (space);
-
-  FOR_EACH_VEC_ELT (scop->pbbs, i, pbb)
-{
-  FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr)
-   if (pdr_write_p (pdr))
+   else if (pdr_write_p (pdr))
  {
if (dump_file)
  {
@@ -121,33 +103,14 @@ scop_get_must_writes (scop_p scop)
  }
isl_union_map *um
  = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
-   res = isl_union_map_union (res, um);
+   must_writes = isl_union_map_union (must_writes, um);
if (dump_file)
  {
fprintf (dump_file, "Must writes depedence graph: ");
-   print_isl_union_map (dump_file, res);
+   print_isl_union_map (dump_file, must_writes);
  }
  }
-}
-
-  return isl_union_map_coalesce (res);
-}
-
-/* Returns all the memory may writes in SCOP.  */
-
-static isl_union_map *
-scop_get_may_writes (scop_p scop)
-{
-  int i, j;
-  poly_bb_p pbb;
-  poly_dr_p pdr;
-  isl_space *space = isl_set_get_space (scop->param_context);
-  isl_union_map *res = isl_union_map_empty (space);
-
-  FOR_EACH_VEC_ELT (scop->pbbs, i, pbb)
-{
-  FOR_EACH_VEC_ELT (PBB_DRS (pbb), j, pdr)
-   if (pdr_may_write_p (pdr))
+   else if (pdr_may_write_p (pdr))
  {
if (dump_file)
  {
@@ -156,16 +119,15 @@ scop_get_may_writes (scop_p scop)
  }
isl_union_map *um
  = isl_union_map_from_map (add_pdr_constraints (pdr, pbb));
-   res = isl_union_map_union (res, um);
+   may_writes = isl_union_map_union (may_writes, um);
if (dump_file)
  {
fprintf (dump_file, "May writes depedence graph: ");
-   print_isl_union_map (dump_file, res);
+   print_isl_union_map (dump_file, may_writes);
  }
  }
+  }
 }
-
-  return isl_union_map_coalesce (res);
 }
 
 /* Helper function used on each MAP of a isl_union_map.  Computes the
@@ -300,9 +262,11 @@ scop_get_dependences (scop_p scop)
   if (scop->dependence)
 return;
 
-  isl_union_map *reads = scop_get_reads (scop);
-  isl_union_map *must_writes = scop_get_must_writes (scop);
-  isl_union_map *may_writes = scop_get_may_writes (scop);
+  isl_space *space = isl_set_get_space (scop->param_context);
+  isl_union_map *reads = isl_union_map_empty (isl_space_copy (space));
+  isl_union_map 

Re: isl scheduler and spatial locality (Re: [PATCH][GRAPHITE] More TLC)

2017-10-01 Thread Sven Verdoolaege
On Sat, Sep 30, 2017 at 07:47:43PM +0200, Richard Biener wrote:
> On September 29, 2017 9:58:41 PM GMT+02:00, Sebastian Pop  
> wrote:
> >On Fri, Sep 29, 2017 at 2:37 PM, Sven Verdoolaege
> > wrote:
> >> [Sorry for the resend; I used the wrong email address to CC Alex]
> >>
> >> On Wed, Sep 27, 2017 at 02:18:51PM +0200, Richard Biener wrote:
> >>> Ah, so I now see why we do not perform interchange on trivial cases
> >like
> >>>
> >>> double A[1024][1024], B[1024][1024];
> >>>
> >>> void foo(void)
> >>> {
> >>>   for (int i = 0; i < 1024; ++i)
> >>> for (int j = 0; j < 1024; ++j)
> >>>   A[j][i] = B[j][i];
> >>> }
> >>
> >> I didn't see you mentioning _why_ you expect an interchange here.
> >> Are you perhaps interested in spatial locality?
> >> If so, then there are several approaches for taking
> >> that into account.
> >> - pluto performs an intra-tile loop interchange to
> >>   improve temporal and/or spatial locality.  It shouldn't
> >>   be too hard to do something similar on an isl generated
> >>   schedule
> >> - Alex (Oleksandr) has been working on an extension of
> >>   the isl scheduler that takes into account spatial locality.
> >>   I'm not sure if it's publicly available.
> >> - I've been working on a special case of spatial locality
> >>   (consecutivity).  The current version is available in
> >>   the consecutivity branch.  Note that it may get rebased and
> >>   it may not necessarily get merged into master.
> >>
> >> There are also other approaches, but they may not be that
> >> easy to combine with the isl scheduler.
> >
> >Would the following work?
> >Add to the proximity relation the array accesses from two
> >successive iterations of the innermost loop:
> >A[j][i] -> A[j][i+1] and B[j][i] -> B[j][i+1]
> >With these two extra relations in the proximity map,
> >isl should be able to interchange the above loop.
> 
> Can anyone give a hint on how to do that in ISL terms? 

For the approach pluto is taking, you'll have to look at the source
code, see pluto_intra_tile_optimize_band.
For the other two approaches I mentioned above, reports will
be made available within the next couple of weeks.
For the last one, there is a sample implementation in the
consecutivity branch of PPCG.

skimo


Re: isl scheduler and spatial locality (Re: [PATCH][GRAPHITE] More TLC)

2017-09-30 Thread Richard Biener
On September 29, 2017 9:58:41 PM GMT+02:00, Sebastian Pop  
wrote:
>On Fri, Sep 29, 2017 at 2:37 PM, Sven Verdoolaege
> wrote:
>> [Sorry for the resend; I used the wrong email address to CC Alex]
>>
>> On Wed, Sep 27, 2017 at 02:18:51PM +0200, Richard Biener wrote:
>>> Ah, so I now see why we do not perform interchange on trivial cases
>like
>>>
>>> double A[1024][1024], B[1024][1024];
>>>
>>> void foo(void)
>>> {
>>>   for (int i = 0; i < 1024; ++i)
>>> for (int j = 0; j < 1024; ++j)
>>>   A[j][i] = B[j][i];
>>> }
>>
>> I didn't see you mentioning _why_ you expect an interchange here.
>> Are you perhaps interested in spatial locality?
>> If so, then there are several approaches for taking
>> that into account.
>> - pluto performs an intra-tile loop interchange to
>>   improve temporal and/or spatial locality.  It shouldn't
>>   be too hard to do something similar on an isl generated
>>   schedule
>> - Alex (Oleksandr) has been working on an extension of
>>   the isl scheduler that takes into account spatial locality.
>>   I'm not sure if it's publicly available.
>> - I've been working on a special case of spatial locality
>>   (consecutivity).  The current version is available in
>>   the consecutivity branch.  Note that it may get rebased and
>>   it may not necessarily get merged into master.
>>
>> There are also other approaches, but they may not be that
>> easy to combine with the isl scheduler.
>
>Would the following work?
>Add to the proximity relation the array accesses from two
>successive iterations of the innermost loop:
>A[j][i] -> A[j][i+1] and B[j][i] -> B[j][i+1]
>With these two extra relations in the proximity map,
>isl should be able to interchange the above loop.

Can anyone give a hint on how to do that in ISL terms? 

Richard. 

>Sebastian



Re: isl scheduler and spatial locality (Re: [PATCH][GRAPHITE] More TLC)

2017-09-29 Thread Oleksandr Zinenko



On 29/09/17 21:58, Sebastian Pop wrote:

On Fri, Sep 29, 2017 at 2:37 PM, Sven Verdoolaege
 wrote:

[Sorry for the resend; I used the wrong email address to CC Alex]

On Wed, Sep 27, 2017 at 02:18:51PM +0200, Richard Biener wrote:

Ah, so I now see why we do not perform interchange on trivial cases like

double A[1024][1024], B[1024][1024];

void foo(void)
{
   for (int i = 0; i < 1024; ++i)
 for (int j = 0; j < 1024; ++j)
   A[j][i] = B[j][i];
}

I didn't see you mentioning _why_ you expect an interchange here.
Are you perhaps interested in spatial locality?
If so, then there are several approaches for taking
that into account.
- pluto performs an intra-tile loop interchange to
   improve temporal and/or spatial locality.  It shouldn't
   be too hard to do something similar on an isl generated
   schedule
- Alex (Oleksandr) has been working on an extension of
   the isl scheduler that takes into account spatial locality.
   I'm not sure if it's publicly available.
- I've been working on a special case of spatial locality
   (consecutivity).  The current version is available in
   the consecutivity branch.  Note that it may get rebased and
   it may not necessarily get merged into master.

There are also other approaches, but they may not be that
easy to combine with the isl scheduler.

Would the following work?
Add to the proximity relation the array accesses from two
successive iterations of the innermost loop:
A[j][i] -> A[j][i+1] and B[j][i] -> B[j][i+1]
With these two extra relations in the proximity map,
isl should be able to interchange the above loop.

Sebastian

Hi,

this looks very close to what we do for spatial locality in the 
scheduler, except that we separate proximity and "spatial proximity" 
maps.  There are a couple of caveats in just plugging those into 
proximity, in particular resolving conflicts between spatial and 
temporal locality and unnecessary skewing.


Cheers,
Alex

--
Oleksandr Zinenko,
Inria / École Normale Supérieure,
cont...@ozinenko.com, oleksandr.zine...@inria.fr
https://www.ozinenko.com



Re: isl scheduler and spatial locality (Re: [PATCH][GRAPHITE] More TLC)

2017-09-29 Thread Sebastian Pop
On Fri, Sep 29, 2017 at 2:37 PM, Sven Verdoolaege
 wrote:
> [Sorry for the resend; I used the wrong email address to CC Alex]
>
> On Wed, Sep 27, 2017 at 02:18:51PM +0200, Richard Biener wrote:
>> Ah, so I now see why we do not perform interchange on trivial cases like
>>
>> double A[1024][1024], B[1024][1024];
>>
>> void foo(void)
>> {
>>   for (int i = 0; i < 1024; ++i)
>> for (int j = 0; j < 1024; ++j)
>>   A[j][i] = B[j][i];
>> }
>
> I didn't see you mentioning _why_ you expect an interchange here.
> Are you perhaps interested in spatial locality?
> If so, then there are several approaches for taking
> that into account.
> - pluto performs an intra-tile loop interchange to
>   improve temporal and/or spatial locality.  It shouldn't
>   be too hard to do something similar on an isl generated
>   schedule
> - Alex (Oleksandr) has been working on an extension of
>   the isl scheduler that takes into account spatial locality.
>   I'm not sure if it's publicly available.
> - I've been working on a special case of spatial locality
>   (consecutivity).  The current version is available in
>   the consecutivity branch.  Note that it may get rebased and
>   it may not necessarily get merged into master.
>
> There are also other approaches, but they may not be that
> easy to combine with the isl scheduler.

Would the following work?
Add to the proximity relation the array accesses from two
successive iterations of the innermost loop:
A[j][i] -> A[j][i+1] and B[j][i] -> B[j][i+1]
With these two extra relations in the proximity map,
isl should be able to interchange the above loop.

Sebastian


isl scheduler and spatial locality (Re: [PATCH][GRAPHITE] More TLC)

2017-09-29 Thread Sven Verdoolaege
[Sorry for the resend; I used the wrong email address to CC Alex]

On Wed, Sep 27, 2017 at 02:18:51PM +0200, Richard Biener wrote:
> Ah, so I now see why we do not perform interchange on trivial cases like
> 
> double A[1024][1024], B[1024][1024];
> 
> void foo(void)
> {
>   for (int i = 0; i < 1024; ++i)
> for (int j = 0; j < 1024; ++j)
>   A[j][i] = B[j][i];
> }

I didn't see you mentioning _why_ you expect an interchange here.
Are you perhaps interested in spatial locality?
If so, then there are several approaches for taking
that into account.
- pluto performs an intra-tile loop interchange to
  improve temporal and/or spatial locality.  It shouldn't
  be too hard to do something similar on an isl generated
  schedule
- Alex (Oleksandr) has been working on an extension of
  the isl scheduler that takes into account spatial locality.
  I'm not sure if it's publicly available.
- I've been working on a special case of spatial locality
  (consecutivity).  The current version is available in
  the consecutivity branch.  Note that it may get rebased and
  it may not necessarily get merged into master.

There are also other approaches, but they may not be that
easy to combine with the isl scheduler.

skimo


isl scheduler and spatial locality (Re: [PATCH][GRAPHITE] More TLC)

2017-09-29 Thread Sven Verdoolaege
On Wed, Sep 27, 2017 at 02:18:51PM +0200, Richard Biener wrote:
> Ah, so I now see why we do not perform interchange on trivial cases like
> 
> double A[1024][1024], B[1024][1024];
> 
> void foo(void)
> {
>   for (int i = 0; i < 1024; ++i)
> for (int j = 0; j < 1024; ++j)
>   A[j][i] = B[j][i];
> }

I didn't see you mentioning _why_ you expect an interchange here.
Are you perhaps interested in spatial locality?
If so, then there are several approaches for taking
that into account.
- pluto performs an intra-tile loop interchange to
  improve temporal and/or spatial locality.  It shouldn't
  be too hard to do something similar on an isl generated
  schedule
- Alex (Oleksandr) has been working on an extension of
  the isl scheduler that takes into account spatial locality.
  I'm not sure if it's publicly available.
- I've been working on a special case of spatial locality
  (consecutivity).  The current version is available in
  the consecutivity branch.  Note that it may get rebased and
  it may not necessarily get merged into master.

There are also other approaches, but they may not be that
easy to combine with the isl scheduler.

skimo


Re: [PATCH][GRAPHITE] More TLC

2017-09-29 Thread Sebastian Pop
On Fri, Sep 29, 2017 at 6:17 AM, Richard Biener  wrote:
> I fixed the "hack patch" somewhat but realized it's not really possible
> ATM to get back at this form because the array descriptor contains only
> information to generate the linearized form.  So while I get now correct
> values they end up with runtime divisions which aren't handled by
> SCEV.

You are right, SCEV has some limits on representing and folding
those division expressions.

There is a proposal in LLVM from Johannes Doerfert
https://reviews.llvm.org/D38255
to use isl as a representation and expression folder instead of the
chains of recurrences for the scalar evolution analysis.  isl would
be able to handle some of the semantics of the div_exprs, and the
semantics of wrap-around variables, and of course it would have
some other limits to represent multiplications (as we spoke about
yesterday, i * N or M * N for example,) and thus that polyhedral
analysis would need to rely on the delinearization of array indices.


Re: [PATCH][GRAPHITE] Abstract away codegen_error setting

2017-09-29 Thread Sebastian Pop
On Fri, Sep 29, 2017 at 3:52 AM, Richard Biener  wrote:
>
> This moves it to a function to make it easy to enable ICEing on them
> in one place.
>
> Bootstrapped / tested on x86_64-unknown-linux-gnu, applied.
>
> Richard.
>
> 2017-09-29  Richard Biener  
>
> * graphite-isl-ast-to-gimple.c
> (translate_isl_ast_to_gimple::set_codegen_error): New function.
> (binary_op_to_tree): Use it.
> (get_rename_from_scev): Likewise.
> (copy_loop_phi_nodes): Likewise.
> (copy_bb_and_scalar_dependences): Likewise.
> (translate_pending_phi_nodes): Likewise.

Looks good.  Thanks.


Re: [PATCH][GRAPHITE] Avoid CHRECs with evolution in loops not in the nest

2017-09-29 Thread Sebastian Pop
On Fri, Sep 29, 2017 at 6:18 AM, Richard Biener  wrote:
> The idea is that we'd transform the above to
> basically wrap each SCOP inside a loop that doesn't iterate.
>
> Does this look reasonable?

Yes, I think your solution looks good.

> 2017-09-29  Richard Biener  
>
> PR tree-optimization/82355
> * graphite-isl-ast-to-gimple.c (build_iv_mapping): Also build
> a mapping for the enclosing loop but avoid generating one for
> the loop tree root.
> (copy_bb_and_scalar_dependences): Remove premature codegen
> error on PHIs in blocks duplicated into multiple places.
> * graphite-scop-detection.c
> (scop_detection::stmt_has_simple_data_refs_p): For a loop not
> in the region use it as loop and nest to analyze the DR in.
> (try_generate_gimple_bb): Likewise.
> * graphite-sese-to-poly.c (extract_affine_chrec): Adjust.
> (add_loop_constraints): For blocks in a loop not in the region
> create a dimension with a single iteration.
> * sese.h (gbb_loop_at_index): Remove assert.
>
> * gcc.dg/graphite/fuse-1.c: Adjust.
> * gcc.dg/graphite/fuse-2.c: Likewise.
> * gcc.dg/graphite/pr82355.c: New testcase.

The change looks good to me.  Thank you!


Re: [PATCH][GRAPHITE] Avoid CHRECs with evolution in loops not in the nest

2017-09-29 Thread Richard Biener
On Fri, 29 Sep 2017, Richard Biener wrote:

> 
> For gcc.dg/graphite/scop-4.c when we analyze data-refs of the first
> two inner loops (with the scalar BB in between)
> 
>   for (i = 1; i < 100; i++) /// loop 1
> {
> -- scop start
>   for (j = 1; j < 80; j++)  /// loop 2
> a[j][i] = a[j+1][2*i-1*j] + 12;
> 
>   b[i] = b[i-1] + 10;
> 
>   for (j = 1; j < 60; j++)  /// loop 3
> a[j][i] = a[j+1][i-1] + 8;
> -- scop end
> 
>   bar ();
> 
>   if (i == 23)
> b[i] = a[i-1][i] + 6;
> }
> 
> we end up asking data-ref analysis to analyze b[i] and b[i-1] with
> respect to loop == nest == 2.  That doesn't make much sense in
> itself and thus we get some weird answers.  The one that confuses
> us is {0, +, 1}_1 because later when we try to extract_affine_chrec
> on that ISL aborts because we feed it -1 as the dimension for the
> loop.
> 
> While we can with some effort suppress those CHRECs I think the
> better solution is to make GRAPHITE not ask SCEV to do this
> kind of analysis but consistently analyze DRs in the containing
> loop even if that loop is not within the current region.
> 
> The idea is that we'd transform the above to
> 
>   for (i = 1; i < 100; i++) /// loop 1
> {
> -- scop start
>  do
>   {
>   i' = i;
> 
>   for (j = 1; j < 80; j++)  /// loop 2
> a[j][i'] = a[j+1][2*i'-1*j] + 12;
> 
>   b[i'] = b[i'-1] + 10;
> 
>   for (j = 1; j < 60; j++)  /// loop 3
> a[j][i'] = a[j+1][i'-1] + 8;
>   }
>  while (0);
> -- scop end
> 
>   bar ();
> 
>   if (i == 23)
> b[i] = a[i-1][i] + 6;
> }
> 
> basically wrap each SCOP inside a loop that doesn't iterate.
> 
> This should make analyzing the data-refs possible and enable
> dependence analysis for them.
> 
> I needed to create an extra dimension for this "loop" and
> adjust some +- 1 stuff (ugh, this can need some better abstraction...).
> 
> On the way the stmt_has_simple_data_refs_p/try_generate_gimple_bb
> changes fix PR82355 as well.
> 
> The copy_bb_and_scalar_dependences change is to avoid extra
> code-gen errors on some existing graphite testcases where we now
> perform more elaborate transforms.
> 
> 
> I probably started with one of the more difficult code-gen errors
> here so bear with my lack of GRAPHITE knowledge.
> 
> Does this look reasonable?
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu (with & without
> -floop-nest-optimize -fgraphite-identity).
> 
> I've verified that the graphite testsuite now has less ICEs when
> turning code-gen errors into ICEs.
> 
> Results of throwing this onto SPEC CPU 2006 are somewhat hard to
> interpret.  Before the patch we see
> 
> loop nest optimized: 105
> loop nest not optimized, code generation error: 42
> loop nest not optimized, optimized schedule is identical to original 
> schedule: 119
> loop nest not optimized, optimization timed out: 36
> loop nest not optimized, ISL signalled an error: 6
> loop nest: 308
> 
> and after
> 
> loop nest optimized: 87
> loop nest not optimized, code generation error: 93
> loop nest not optimized, optimized schedule is identical to original 
> schedule: 216
> loop nest not optimized, optimization timed out: 46
> loop nest not optimized, ISL signalled an error: 8
> loop nest: 450
> 
> first of all this means we've rejected fewer SCOPs initially
> (very likely because of alias-sets now matching and maybe because
> asking SCEV for sth sensible turns out to result in less
> scev_not_known stuff in affected DRs).  I'm not sure why
> we end up optimizing less (maybe because I don't analyze DRs
> in loops contained in the region with respect to the containing loop
> as well -- I'd have to pre-compute that loop).

So fixing the build_iv_mapping hunk to

  /* Record sth only for real loops.  */
  if (loop_in_sese_p (old_loop, region))
iv_map[old_loop->num] = t;

changes statistics to a favorable

loop nest optimized: 134
loop nest not optimized, code generation error: 46
loop nest not optimized, optimized schedule is identical to original 
schedule: 216
loop nest not optimized, optimization timed out: 46
loop nest not optimized, ISL signalled an error: 8
loop nest: 450

Consider the patch changed that way.

I guess I'll add a -fgraphite-no-codegen-error to force ICEing on them
and enable that on the graphite.exp testcases while explicitly
sticking on a -fno-graphite-no-codegen-error on those who'll ICE
and scan for a codegen error message on them.

That makes it easier to avoid regressing in that area.

Richard.

> Anyway.
> 
> Ok for trunk?
> 
> Thanks,
> Richard.
> 
> 2017-09-29  Richard Biener  
> 
>   PR tree-optimization/82355
>   * graphite-isl-ast-to-gimple.c (build_iv_mapping): Also build
>   a mapping for the enclosing loop but avoid generating one for
>   the loop tree root.
>   (copy_bb_and_scalar_dependences): Remove premature codegen
>   error on PHIs in blocks duplicated into 

[PATCH][GRAPHITE] Avoid CHRECs with evolution in loops not in the nest

2017-09-29 Thread Richard Biener

For gcc.dg/graphite/scop-4.c when we analyze data-refs of the first
two inner loops (with the scalar BB in between)

  for (i = 1; i < 100; i++) /// loop 1
{
-- scop start
  for (j = 1; j < 80; j++)  /// loop 2
a[j][i] = a[j+1][2*i-1*j] + 12;

  b[i] = b[i-1] + 10;

  for (j = 1; j < 60; j++)  /// loop 3
a[j][i] = a[j+1][i-1] + 8;
-- scop end

  bar ();

  if (i == 23)
b[i] = a[i-1][i] + 6;
}

we end up asking data-ref analysis to analyze b[i] and b[i-1] with
respect to loop == nest == 2.  That doesn't make much sense in
itself and thus we get some weird answers.  The one that confuses
us is {0, +, 1}_1 because later when we try to extract_affine_chrec
on that ISL aborts because we feed it -1 as the dimension for the
loop.

While we can with some effort suppress those CHRECs I think the
better solution is to make GRAPHITE not ask SCEV to do this
kind of analysis but consistently analyze DRs in the containing
loop even if that loop is not within the current region.

The idea is that we'd transform the above to

  for (i = 1; i < 100; i++) /// loop 1
{
-- scop start
 do
  {
  i' = i;

  for (j = 1; j < 80; j++)  /// loop 2
a[j][i'] = a[j+1][2*i'-1*j] + 12;

  b[i'] = b[i'-1] + 10;

  for (j = 1; j < 60; j++)  /// loop 3
a[j][i'] = a[j+1][i'-1] + 8;
  }
 while (0);
-- scop end

  bar ();

  if (i == 23)
b[i] = a[i-1][i] + 6;
}

basically wrap each SCOP inside a loop that doesn't iterate.

This should make analyzing the data-refs possible and enable
dependence analysis for them.

I needed to create an extra dimension for this "loop" and
adjust some +- 1 stuff (ugh, this can need some better abstraction...).

On the way the stmt_has_simple_data_refs_p/try_generate_gimple_bb
changes fix PR82355 as well.

The copy_bb_and_scalar_dependences change is to avoid extra
code-gen errors on some existing graphite testcases where we now
perform more elaborate transforms.


I probably started with one of the more difficult code-gen errors
here so bear with my lack of GRAPHITE knowledge.

Does this look reasonable?

Bootstrapped and tested on x86_64-unknown-linux-gnu (with & without
-floop-nest-optimize -fgraphite-identity).

I've verified that the graphite testsuite now has fewer ICEs when
turning code-gen errors into ICEs.

Results of throwing this onto SPEC CPU 2006 are somewhat hard to
interpret.  Before the patch we see

loop nest optimized: 105
loop nest not optimized, code generation error: 42
loop nest not optimized, optimized schedule is identical to original 
schedule: 119
loop nest not optimized, optimization timed out: 36
loop nest not optimized, ISL signalled an error: 6
loop nest: 308

and after

loop nest optimized: 87
loop nest not optimized, code generation error: 93
loop nest not optimized, optimized schedule is identical to original 
schedule: 216
loop nest not optimized, optimization timed out: 46
loop nest not optimized, ISL signalled an error: 8
loop nest: 450

first of all this means we've rejected fewer SCOPs initially
(very likely because of alias-sets now matching and maybe because
asking SCEV for sth sensible turns out to result in less
scev_not_known stuff in affected DRs).  I'm not sure why
we end up optimizing less (maybe because I don't analyze DRs
in loops contained in the region with respect to the containing loop
as well -- I'd have to pre-compute that loop).

Anyway.

Ok for trunk?

Thanks,
Richard.

2017-09-29  Richard Biener  

PR tree-optimization/82355
* graphite-isl-ast-to-gimple.c (build_iv_mapping): Also build
a mapping for the enclosing loop but avoid generating one for
the loop tree root.
(copy_bb_and_scalar_dependences): Remove premature codegen
error on PHIs in blocks duplicated into multiple places.
* graphite-scop-detection.c
(scop_detection::stmt_has_simple_data_refs_p): For a loop not
in the region use it as loop and nest to analyze the DR in.
(try_generate_gimple_bb): Likewise.
* graphite-sese-to-poly.c (extract_affine_chrec): Adjust.
(add_loop_constraints): For blocks in a loop not in the region
create a dimension with a single iteration.
* sese.h (gbb_loop_at_index): Remove assert.

* gcc.dg/graphite/fuse-1.c: Adjust.
* gcc.dg/graphite/fuse-2.c: Likewise.
* gcc.dg/graphite/pr82355.c: New testcase.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253282)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -774,8 +775,10 @@ build_iv_mapping (vec iv_map, gimp
   if (codegen_error_p ())
t = integer_zero_node;
 
-  loop_p old_loop = gbb_loop_at_index (gbb, region, i - 1);
-  iv_map[old_loop->num] = t;
+  loop_p old_loop = gbb_loop_at_index (gbb, region, i - 2);
+

Re: [PATCH][GRAPHITE] More TLC

2017-09-29 Thread Richard Biener
On Thu, 28 Sep 2017, Sebastian Pop wrote:

> On Wed, Sep 27, 2017 at 9:33 AM, Richard Biener  wrote:
> > Looks like even when hacking the Fortran FE to produce nested
> > ARRAY_REFs we run into the same issue for
> >
> > (gdb) p debug_data_reference (dr)
> > #(Data Ref:
> > #  bb: 17
> > #  stmt:
> > VIEW_CONVERT_EXPR(*y_117(D))[_24]{lb:
> > 1 sz: _20 * 8}[_26]{lb: 1 sz: _21 * 8}[_28]{lb: 1 sz: _22 * 8}[_29]{lb: 1
> > sz: 8} = 0.0;
> > #  ref:
> > VIEW_CONVERT_EXPR(*y_117(D))[_24]{lb:
> > 1 sz: _20 * 8}[_26]{lb: 1 sz: _21 * 8}[_28]{lb: 1 sz: _22 * 8}[_29]{lb: 1
> > sz: 8};
> > #  base_object:
> > VIEW_CONVERT_EXPR(*y_117(D));
> > #  Access function 0: {1, +, 1}_4
> > #  Access function 1: (integer(kind=8)) {(unsigned long) stride.88_92, +,
> > (unsigned long) stride.88_92}_3;
> > #  Access function 2: (integer(kind=8)) {(unsigned long) stride.90_96, +,
> > (unsigned long) stride.90_96}_2;
> > #  Access function 3: (integer(kind=8)) {(unsigned long) stride.92_100, +,
> > (unsigned long) stride.92_100}_1;
> >
> > so it looks like simple strided (where stride is a parameter) access
> > is not handled either.
> 
> Yes, this is the first option I was mentioning: it could work,
> could you please make sure that you don't have a bug in the "hack patch"
> where the outer dimension should not contain the parameter
> (inner array dimension) times the access function.
> 
> Example in C:
> int A[100][N];
> A[i][j] is linearized as *(A + i * N * 4 + j * 4)
> and you may have a bug if you delinearized it in the Fortran FE as A[i * N][j]
> Could you please check that it would delinearize back to A[i][j]?

I fixed the "hack patch" somewhat but realized it's not really possible
ATM to get back at this form because the array descriptor contains only
information to generate the linearized form.  So while I get now correct
values they end up with runtime divisions which aren't handled by
SCEV.

I fear emitting delinearized code from the Fortran FE would be an
ABI breakage as we'd have to change the array descriptor contents.

> >
> > GCCs dependence analysis can at least compute distances of two
> > DRs when the difference of the access CHRECs is constant.  Within
> > the polyhedral model those cases cannot be handled?
> 
> The difficulty for the polyhedral model is in the representation
> of a multiplication of parameter times loop index variable.
> The delinearization removes these difficulties by creating linear expressions.
> 
> Think about multiplication as something introducing exponentiality
> and you realize that any such expression would not fit in the
> linear model of polyhedra.
> A parameter is nothing else than an outer loop index to which we don't
> have access to that loop level as it may be outside the current function
> in which we get that parameter in.

Yeah, I see that now.

Richard.


Re: [PATCH][GRAPHITE] Allow --param graphite-max-arrays-per-scop=0

2017-09-29 Thread Richard Biener
On Thu, 28 Sep 2017, Sebastian Pop wrote:

> On Wed, Sep 27, 2017 at 6:51 AM, Richard Biener  wrote:
> >
> > The following is to allow making --param graphite-max-arrays-per-scop
> > unbounded.  That's a little tricky because the bound is used when
> > computing "alias-sets" for scalar constraints.  There's an easy way
> > out though as we know the maximum alias-set assigned in the SCOP,
> > we only have to remember it.  The advantage (if it matters at all)
> > is that we avoid a constraint coefficient gap between that last
> > used alias-set and the former PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP.
> >
> > Bootstrap and regtest running on x86_64-unknown-linux-gnu, SPEC CPU 2006
> > tested.  Will apply after testing finished.
> >
> > Richard.
> >
> > 2017-09-27  Richard Biener  
> >
> > * graphite.h (scop::max_alias_set): New member.
> > * graphite-scop-detection.c: Remove references to non-existing
> > --param in comments.
> > (build_alias_sets): Record the maximum alias set used for drs.
> > (build_scops): Support zero as unlimited for
> > --param graphite-max-arrays-per-scop.
> > * graphite-sese-to-poly.c (add_scalar_version_numbers): Remove
> > and inline into ...
> > (build_poly_sr_1): ... here.  Compute alias set based on the
> > maximum alias set used for drs rather than
> > PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP
> >
> 
> Maybe we should keep this limit, and instead of failing to handle
> huge scops, we could stop the scop detection to expand the
> scop past this limit?

Yes, I believe we should put all the checks we do to discard SCOPs
at SESE region build time and try if smaller regions would fit within
the limits.

Of course it's hard to guess whether ISL will eventually time-out or 
not...

Another check that kills quite many SCOPs in the end is verifying
whether we can handle the dependences - unfortunately that one is
necessarily quadratic in the number of DRs... (hacking this away
as in if we'd do versioning on a proper condition gets us 10 times
more SCOPs in SPEC CPU 2006 but only 50 more transforms because
code-gen errors sky-rocket).

I'm going to leave the limits in place right now as I'm shifting
towards fixing existing code-gen issues at the moment.

Richard.


[PATCH][GRAPHITE] Abstract away codegen_error setting

2017-09-29 Thread Richard Biener

This moves it to a function to make it easy to enable ICEing on them
in one place.

Bootstrapped / tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-09-29  Richard Biener  

* graphite-isl-ast-to-gimple.c
(translate_isl_ast_to_gimple::set_codegen_error): New function.
(binary_op_to_tree): Use it.
(get_rename_from_scev): Likewise.
(copy_loop_phi_nodes): Likewise.
(copy_bb_and_scalar_dependences): Likewise.
(translate_pending_phi_nodes): Likewise.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253249)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -240,6 +240,7 @@ class translate_isl_ast_to_gimple
   void gsi_insert_earliest (gimple_seq seq);
   tree rename_all_uses (tree new_expr, basic_block new_bb, basic_block old_bb);
   bool codegen_error_p () const { return codegen_error; }
+  void set_codegen_error () { codegen_error = true; }
   bool is_constant (tree op) const
   {
 return TREE_CODE (op) == INTEGER_CST
@@ -347,7 +348,7 @@ binary_op_to_tree (tree type, __isl_take
 division by 2^64 that is folded to 0.  */
   if (integer_zerop (tree_rhs_expr))
{
- codegen_error = true;
+ set_codegen_error ();
  return NULL_TREE;
}
   return fold_build2 (EXACT_DIV_EXPR, type, tree_lhs_expr, tree_rhs_expr);
@@ -357,7 +358,7 @@ binary_op_to_tree (tree type, __isl_take
 division by 2^64 that is folded to 0.  */
   if (integer_zerop (tree_rhs_expr))
{
- codegen_error = true;
+ set_codegen_error ();
  return NULL_TREE;
}
   return fold_build2 (TRUNC_DIV_EXPR, type, tree_lhs_expr, tree_rhs_expr);
@@ -368,7 +369,7 @@ binary_op_to_tree (tree type, __isl_take
 division by 2^64 that is folded to 0.  */
   if (integer_zerop (tree_rhs_expr))
{
- codegen_error = true;
+ set_codegen_error ();
  return NULL_TREE;
}
   return fold_build2 (TRUNC_MOD_EXPR, type, tree_lhs_expr, tree_rhs_expr);
@@ -378,7 +379,7 @@ binary_op_to_tree (tree type, __isl_take
 division by 2^64 that is folded to 0.  */
   if (integer_zerop (tree_rhs_expr))
{
- codegen_error = true;
+ set_codegen_error ();
  return NULL_TREE;
}
   return fold_build2 (FLOOR_DIV_EXPR, type, tree_lhs_expr, tree_rhs_expr);
@@ -1497,7 +1498,7 @@ get_rename_from_scev (tree old_name, gim
   tree new_expr;
   if (chrec_contains_undetermined (scev))
 {
-  codegen_error = true;
+  set_codegen_error ();
   return build_zero_cst (TREE_TYPE (old_name));
 }
 
@@ -1510,7 +1511,7 @@ get_rename_from_scev (tree old_name, gim
   if (chrec_contains_undetermined (new_expr)
   || tree_contains_chrecs (new_expr, NULL))
 {
-  codegen_error = true;
+  set_codegen_error ();
   return build_zero_cst (TREE_TYPE (old_name));
 }
 
@@ -1519,7 +1520,7 @@ get_rename_from_scev (tree old_name, gim
   basic_block bb = gimple_bb (SSA_NAME_DEF_STMT (new_expr));
   if (bb && !dominated_by_p (CDI_DOMINATORS, new_bb, bb))
{
- codegen_error = true;
+ set_codegen_error ();
  return build_zero_cst (TREE_TYPE (old_name));
}
 }
@@ -1539,7 +1540,7 @@ get_rename_from_scev (tree old_name, gim
  basic_block bb = gimple_bb (SSA_NAME_DEF_STMT (new_ssa_name));
  if (bb && !dominated_by_p (CDI_DOMINATORS, new_bb, bb))
{
- codegen_error = true;
+ set_codegen_error ();
  return build_zero_cst (TREE_TYPE (old_name));
}
}
@@ -1820,8 +1822,8 @@ copy_loop_phi_nodes (basic_block bb, bas
   tree new_res = create_new_def_for (res, new_phi,
 gimple_phi_result_ptr (new_phi));
   set_rename (res, new_res);
-  codegen_error = !copy_loop_phi_args (phi, ibp_old_bb, new_phi,
-  ibp_new_bb, true);
+  if (!copy_loop_phi_args (phi, ibp_old_bb, new_phi, ibp_new_bb, true))
+   set_codegen_error ();
   update_stmt (new_phi);
 
   if (dump_file)
@@ -2593,7 +2595,7 @@ copy_bb_and_scalar_dependences (basic_bl
 outside the region.  */
   if (num_phis)
{
- codegen_error = true;
+ set_codegen_error ();
  return NULL;
}
 }
@@ -2608,7 +2610,7 @@ copy_bb_and_scalar_dependences (basic_bl
   edge e = edge_for_new_close_phis (bb);
   if (!e)
{
- codegen_error = true;
+ set_codegen_error ();
  return NULL;
}
 
@@ -2622,7 +2624,7 @@ copy_bb_and_scalar_dependences (basic_bl
 
   if (!copy_loop_close_phi_nodes (bb, phi_bb, iv_map))
{
- codegen_error = true;
+ set_codegen_error ();
  return NULL;
}
 
@@ -2644,7 

Re: [PATCH][GRAPHITE] Make --param loop-block-tile-size=0 disable tiling

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 7:20 AM, Richard Biener  wrote:
>
> Currently ISL aborts on this special value and for debugging (and
> tuning?) it's nice to avoid all the clutter introduced by tiling.
>
> Committed as obvious.
>
> Richard.
>
> 2017-09-27  Richard Biener  
>
> * graphite-optimize-isl.c (get_schedule_for_node_st): Allow
> --param loop-block-tile-size=0 to disable tiling.

Looks good.


Re: [PATCH][GRAPHITE] Allow --param graphite-max-arrays-per-scop=0

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 6:51 AM, Richard Biener  wrote:
>
> The following is to allow making --param graphite-max-arrays-per-scop
> unbounded.  That's a little tricky because the bound is used when
> computing "alias-sets" for scalar constraints.  There's an easy way
> out though as we know the maximum alias-set assigned in the SCOP,
> we only have to remember it.  The advantage (if it matters at all)
> is that we avoid a constraint coefficient gap between that last
> used alias-set and the former PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP.
>
> Bootstrap and regtest running on x86_64-unknown-linux-gnu, SPEC CPU 2006
> tested.  Will apply after testing finished.
>
> Richard.
>
> 2017-09-27  Richard Biener  
>
> * graphite.h (scop::max_alias_set): New member.
> * graphite-scop-detection.c: Remove references to non-existing
> --param in comments.
> (build_alias_sets): Record the maximum alias set used for drs.
> (build_scops): Support zero as unlimited for
> --param graphite-max-arrays-per-scop.
> * graphite-sese-to-poly.c (add_scalar_version_numbers): Remove
> and inline into ...
> (build_poly_sr_1): ... here.  Compute alias set based on the
> maximum alias set used for drs rather than
> PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP
>

Maybe we should keep this limit, and instead of failing to handle
huge scops, we could stop the scop detection to expand the
scop past this limit?


Re: [PATCH][GRAPHITE] Remove another small quadraticness

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 6:48 AM, Richard Biener  wrote:
>
> Turns out loop_nest recorded in scop-info isn't really necessary as
> we can simply process parameters in loop bounds during the gather_bbs
> walk where we encounter each loop (identified by its header) once.
>
> This avoids the linear search in record_loop_in_sese.
>
> Bootstrap / regtest running on x86_64-unknown-linux-gnu, will apply.
>
> Richard.
>
> 2017-09-27  Richard Biener  
>
> * graphite-scop-detection.c (find_scop_parameters): Move
> loop bound handling ...
> (gather_bbs::before_dom_children): ... here, avoiding the need
> to build scop_info->loop_nest.
> (record_loop_in_sese): Remove.
> * sese.h (sese_info_t::loop_nest): Remove.
> * sese.c (new_sese_info): Do not allocate loop_nest.
> (free_sese_info): Do not free loop_nest.

Looks good.  Thanks!


Re: [PATCH][GRAPHITE] Speedup SCOP detection some more, add region handling to domwalk

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 6:07 AM, Richard Biener  wrote:
>  /* Maximal number of array references in a scop.  */
>
DEFPARAM (PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP,
  "graphite-max-arrays-per-scop",
  "maximum number of arrays per scop.",
  100, 0, 0)

Let's also remove this param as we now have max-isl-operations.

Thanks,
Sebastian


Re: [PATCH][GRAPHITE] Speedup SCOP detection some more, add region handling to domwalk

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 6:07 AM, Richard Biener  wrote:
>
> This removes another quadraticness from SCOP detection, gather_bbs
> domwalk.  This is done by enhancing domwalk to handle SEME regions
> via a special return value from before_dom_children.
>
> With this I'm now confident to remove the
> PARAM_GRAPHITE_MAX_BBS_PER_FUNCTION parameter and its associated limit.
> Being there I've adjusted PARAM_GRAPHITE_MAX_NB_SCOP_PARAMS to its
> documented default value which enables 90 more loops to be processed
> in SPEC CPU 2006.  I've also made a value of zero magic in disabling
> the limit (a trick commonly used in GCC).
>
> Statistics I have gathered a few patches before for SPEC CPU 2006:
>
> 1255 multi-loop SESEs in SCOP processing
> max. params 34, 3 scops >= 20, 15 scops >= 10, 33 scops >= 8
> max. drs per scop 869, 10 scops >= 100
> max. pbbs per scop 36, 12 scops >= 10
> 919 SCOPs fail in build_alias_sets
>
> which shows the default for PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP
> is reasonable (if tuned to SPEC CPU 2006).
>
> I've also included the hunk that allows -fgraphite-identity
> to work on top of -floop-nest-optimize and for -floop-nest-optimize
> -ftree-parallelize-all also make sure to code-gen loops that
> end up not transformed.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, SPEC CPU 2006
> tested, applied to trunk.
>
> Richard.
>
> 2017-09-27  Richard Biener  
>
> * doc/invoke.texi (graphite-max-bbs-per-function): Remove.
> (graphite-max-nb-scop-params): Document special value zero.
> * domwalk.h (dom_walker::STOP): New symbolical constant.
> (dom_walker::dom_walker): Add optional parameter for bb to
> RPO mapping.
> (dom_walker::~dom_walker): Declare.
> (dom_walker::before_dom_children): Document STOP return value.
> (dom_walker::m_user_bb_to_rpo): New member.
> (dom_walker::m_bb_to_rpo): Likewise.
> * domwalk.c (dom_walker::dom_walker): Compute bb to RPO
> mapping here if not provided by the user.
> (dom_walker::~dom_walker): Free bb to RPO mapping if not
> provided by the user.
> (dom_walker::STOP): Define.
> (dom_walker::walk): Do not compute bb to RPO mapping here.
> Support STOP return value from before_dom_children to stop
> walking.
> * graphite-optimize-isl.c (optimize_isl): If the schedule
> is the same still generate code if -fgraphite-identity
> or -floop-parallelize-all are given.
> * graphite-scop-detection.c: Include cfganal.h.
> (gather_bbs::gather_bbs): Get and pass through bb to RPO
> mapping.
> (gather_bbs::before_dom_children): Return STOP for BBs
> not in the region.
> (build_scops): Compute bb to RPO mapping and pass it to
> the domwalk.  Treat --param graphite-max-nb-scop-params=0
> as not limiting the number of params.
> * graphite.c (graphite_initialize): Remove limit on the
> number of basic-blocks in a function.
> * params.def (PARAM_GRAPHITE_MAX_BBS_PER_FUNCTION): Remove.
> (PARAM_GRAPHITE_MAX_NB_SCOP_PARAMS): Adjust to documented
> default value of 10.

The patch looks good.  Thanks!

>
> Index: gcc/doc/invoke.texi
> ===
> --- gcc/doc/invoke.texi (revision 253224)
> +++ gcc/doc/invoke.texi (working copy)
> @@ -10512,13 +10512,9 @@ sequence pairs.  This option only applie
>  @item graphite-max-nb-scop-params
>  To avoid exponential effects in the Graphite loop transforms, the
>  number of parameters in a Static Control Part (SCoP) is bounded.  The
> -default value is 10 parameters.

Now that we have "compute-out" functionality in all supported
versions of isl, let's remove this parameter.

We needed this in the past when isl was not able to stop an
exponential computation, and that happened when operating
on large dimension spaces.


Re: [PATCH][GRAPHITE] Simplify SCOP detection

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 2:21 AM, Richard Biener  wrote:
> On Tue, 26 Sep 2017, Sebastian Pop wrote:
>
>> On Tue, Sep 26, 2017 at 7:03 AM, Richard Biener  wrote:
>>
>> >
>> > The following is the result of me trying to understand SCOP detection
>> > and the validity checks spread around the machinery.  It removes several
>> > quadraticnesses by folding validity checks into
>> > scop_detection::harmful_loop_in_region where we already walk over all
>> > BBs in the region and process individual found loops.
>> >
>> > It also rewrites build_scop_depth/build_scop_breadth into something
>> > I can understand.
>> >
>> > Bootstrap and regtest is running on x86_64-unknown-linux-gnu (graphite.exp
>> > for all langs is happy, so is SPEC CPU 2006 testing where the statistics
>> > agree before/after the patch).
>> >
>> > I'll apply this after the bootstrap finished.
>> >
>>
>> Have you tried to bootstrap with BOOT_CFLAGS="-O2 -fgraphite-identity"?
>
> I do "-O2 -g -floop-nest-optimize"

Very good.

> but I guess -fgraphite-identity
> should catch more issues?

It would systematically exercise the scop detection and code generation.
When isl scheduler does not find a better schedule, we would not bother
running the code gen part.

> Hmm, maybe -floop-nest-optimize and
> -fgraphite-identity should be combinable
>
> Index: gcc/graphite-optimize-isl.c
> ===
> --- gcc/graphite-optimize-isl.c (revision 253203)
> +++ gcc/graphite-optimize-isl.c (working copy)
> @@ -189,7 +189,7 @@ optimize_isl (scop_p scop)
> print_schedule_ast (dump_file, scop->original_schedule, scop);
>isl_schedule_free (scop->transformed_schedule);
>scop->transformed_schedule = isl_schedule_copy
> (scop->original_schedule);
> -  return false;
> +  return flag_graphite_identity || flag_loop_parallelize_all;

Yes.

>  }
>
>return true;
>
> I'll test/commit the above.

ok.

>
>>
>> > Richard.
>> >
>> > 2017-09-26  Richard Biener  
>> >
>> > * graphite-scop-detection.c (scop_detection::build_scop_depth):
>> > Rewrite,
>> > fold in ...
>> > (scop_detection::build_scop_breadth): ... this.  Removed.
>> > (scop_detection::loop_is_valid_in_scop): Fold into single caller.
>> > (scop_detection::harmful_stmt_in_bb): Likewise.
>> > (scop_detection::graphite_can_represent_stmt): Likewise.
>> > (scop_detection::loop_body_is_valid_scop): Likewise.  Remove
>> > recursion.
>> > (scop_detection::can_represent_loop): Remove recursion, fold in
>> > ...
>> > (scop_detection::can_represent_loop_1): ... this.  Removed.
>> > (scop_detection::harmful_loop_in_region): Simplify after inlining
>> > the above and remove more quadraticness.
>> > (build_scops): Adjust.
>> > * tree-data-ref.c (loop_nest_has_data_refs): Remove pointless
>> > quadraticness.
>> >
>> >
>> This goes in the right direction: it cuts down compilation time.
>> As it is not a trivial change, I need some time to understand how
>> the scop detection works with this change.
>
> The only functional change should be that the SESE composition now
> works top-down instead of working its way bottom-up.  It's not clear
> whether we do more or less work that way

So we went from top-down to bottom-up,
and now with this change we go back to top-down.
I think both algorithms are equivalent in terms of number
of times we validate statements.

We explained the current implementation of the scop detection in:
http://impact.gforge.inria.fr/impact2016/papers/impact2016-kumar.pdf
http://impact.gforge.inria.fr/impact2016/papers/impact2016-kumar-slides.pdf

Here is what happens on an example:

loop_1 {
  loop_2 {
stmt_1
  }
  stmt_2
  loop_3 {
stmt_3
  }
}

- with a top down scop detection, we would start the analysis with loop_1,
and start validating that every stmt in its body (stmt_1, stmt_2,
and finally stmt_3) can be represented in the polyhedral representation.
If at any moment the analysis returns "cannot represent", it would go one
level down and try to validate the immediate sub loop loop_2.
Let's assume that stmt_1 can be represented, and so it would try to
extend the scop by validating stmt_2 and then its sibling loop_3, and say
we fail on validating stmt_3.  All done, max scop is stmt_1 in loop_2
followed by stmt_2.

- with a bottom up we would start from the inner loop_2, it passes
validation of stmt_1, then we extend the scop by validating stmt_2,
and then we fail at validation of stmt_3.  All done, max scop is stmt_1 in
loop_2 followed by stmt_2.  In the bottom-up process we don't
have to validate the outer loop_1.

Supposing that there is no fail in the process, then a top-down detection
would be faster as it does not need to validate one by one the inner loops:
it just goes in one pass over the stmts of loop_1 body.

> I think we can 

Re: [PATCH][GRAPHITE] More TLC

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 9:33 AM, Richard Biener  wrote:
> Looks like even when hacking the Fortran FE to produce nested
> ARRAY_REFs we run into the same issue for
>
> (gdb) p debug_data_reference (dr)
> #(Data Ref:
> #  bb: 17
> #  stmt:
> VIEW_CONVERT_EXPR(*y_117(D))[_24]{lb:
> 1 sz: _20 * 8}[_26]{lb: 1 sz: _21 * 8}[_28]{lb: 1 sz: _22 * 8}[_29]{lb: 1
> sz: 8} = 0.0;
> #  ref:
> VIEW_CONVERT_EXPR(*y_117(D))[_24]{lb:
> 1 sz: _20 * 8}[_26]{lb: 1 sz: _21 * 8}[_28]{lb: 1 sz: _22 * 8}[_29]{lb: 1
> sz: 8};
> #  base_object:
> VIEW_CONVERT_EXPR(*y_117(D));
> #  Access function 0: {1, +, 1}_4
> #  Access function 1: (integer(kind=8)) {(unsigned long) stride.88_92, +,
> (unsigned long) stride.88_92}_3;
> #  Access function 2: (integer(kind=8)) {(unsigned long) stride.90_96, +,
> (unsigned long) stride.90_96}_2;
> #  Access function 3: (integer(kind=8)) {(unsigned long) stride.92_100, +,
> (unsigned long) stride.92_100}_1;
>
> so it looks like simple strided (where stride is a parameter) access
> is not handled either.

Yes, this is the first option I was mentioning: it could work,
could you please make sure that you don't have a bug in the "hack patch"
where the outer dimension should not contain the parameter
(inner array dimension) times the access function.

Example in C:
int A[100][N];
A[i][j] is linearized as *(A + i * N * 4 + j * 4)
and you may have a bug if you delinearized it in the Fortran FE as A[i * N][j]
Could you please check that it would delinearize back to A[i][j]?

>
> GCCs dependence analysis can at least compute distances of two
> DRs when the difference of the access CHRECs is constant.  Within
> the polyhedral model those cases cannot be handled?

The difficulty for the polyhedral model is in the representation
of a multiplication of parameter times loop index variable.
The delinearization removes these difficulties by creating linear expressions.

Think about multiplication as something introducing exponentiality
and you realize that any such expression would not fit in the
linear model of polyhedra.
A parameter is nothing other than an outer loop index whose loop level
we cannot access, as that loop may lie outside the current function
in which we receive the parameter.

Sebastian


Re: [PATCH][GRAPHITE] More TLC

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 8:04 AM, Richard Biener  wrote:
>
> Another thing I notice is that we don't handle the multi-dimensional
> accesses the fortran frontend produces:
>
> (gdb) p debug_data_reference (dr)
> #(Data Ref:
> #  bb: 18
> #  stmt: _43 = *a_141(D)[_42];
> #  ref: *a_141(D)[_42];
> #  base_object: *a_141(D);
> #  Access function 0: {{(_38 + stride.88_115) + 1, +, 1}_4, +,
> stride.88_115}_5
>
> ultimately we fail here because we try to build a constraint for
>
> {{(_38 + stride.88_115) + 1, +, 1}_4, +, stride.88_115}_5
>
> which ends up computing isl_pw_aff_mul (A, stride.88_115) with
> A being the non-constant constraint generated for
> {(_38 + stride.88_115) + 1, +, 1}_4 and stride.88_115 being
> a parameter.  ISL doesn't like that multiplication as the result
> isn't affine (well - it is, we just have parameters in there).
>
> I suppose ISL doesn't handle this form of accesses given the
> two "dimensions" in this scalarized form may overlap?  So we'd
> really need to turn those into references with different access
> functions (even if that's not 100% a valid semantic transformation
> as scalarization isn't reversible without extra information)?

You are right.
This multivariate memory access would be better handled in
delinearized form:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66981
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61000
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14741

There are two ways to handle this issue:
- fix the FORTRAN front-end to emit multi-dimensional ARRAY_REFs,
- implement an array delinearization pass, as I implemented in LLVM
http://llvm.org/doxygen/Delinearization_8cpp_source.html
"On Recovering Multi-Dimensional Arrays in Polly"
http://impact.gforge.inria.fr/impact2015/papers/impact2015-grosser.pdf
"Optimistic Delinearization of Parametrically Sized Arrays"
https://dl.acm.org/citation.cfm?id=2751248
LLVM does not have an equivalent of the multi-dim ARRAY_REF description;
it only reasons about linearized memory accesses, as in GCC's RTL
(gep = Get Element Pointer), so we had no other option than to delinearize.

Sebastian


Re: [PATCH][GRAPHITE] More TLC

2017-09-28 Thread Sebastian Pop
On Wed, Sep 27, 2017 at 7:18 AM, Richard Biener  wrote:

> On Tue, 26 Sep 2017, Sebastian Pop wrote:
>
> > On Mon, Sep 25, 2017 at 8:12 AM, Richard Biener 
> wrote:
> >
> > > On Fri, 22 Sep 2017, Sebastian Pop wrote:
> > >
> > > > On Fri, Sep 22, 2017 at 8:03 AM, Richard Biener 
> > > wrote:
> > > >
> > > > >
> > > > > This simplifies canonicalize_loop_closed_ssa and does other minimal
> > > > > TLC.  It also adds a testcase I reduced from a stupid mistake I
> made
> > > > > when reworking canonicalize_loop_closed_ssa.
> > > > >
> > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to
> trunk.
> > > > >
> > > > > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> > > > > -Ofast -march=haswell -floop-nest-optimize are
> > > > >
> > > > >  61 loop nests "optimized"
> > > > >  45 loop nest transforms cancelled because of code generation
> issues
> > > > >  21 loop nest optimizations timed out the 35 ISL "operations"
> we
> > > allow
> > > > >
> > > > > I say "optimized" because the usual transform I've seen is static
> > > tiling
> > > > > as enforced by GRAPHITE according to --param loop-block-tile-size.
> > > > > There's no way to automagically figure what kind of transform ISL
> did
> > > > >
> > > >
> > > > Here is how to automate (without magic) the detection
> > > > of the transform that isl did.
> > > >
> > > > The problem solved by isl is the minimization of strides
> > > > in memory, and to do this, we need to tell the isl scheduler
> > > > the validity dependence graph, in graphite-optimize-isl.c
> > > > see the validity (RAW, WAR, WAW) and the proximity
> > > > (RAR + validity) maps.  The proximity does include the
> > > > read after read, as the isl scheduler needs to minimize
> > > > strides between consecutive reads.
>
> Ah, so I now see why we do not perform interchange on trivial cases like
>
> double A[1024][1024], B[1024][1024];
>
> void foo(void)
> {
>   for (int i = 0; i < 1024; ++i)
> for (int j = 0; j < 1024; ++j)
>   A[j][i] = B[j][i];
> }
>
> which is probably because
>
>   /* FIXME: proximity should not be validity.  */
>   isl_union_map *proximity = isl_union_map_copy (validity);
>
> falls apart when there is _no_ dependence?
>

You are right.  The proximity needs to account for spatial
locality as well if you want to interchange the loop.
To describe the spatial locality, I would recommend adding
to the proximity relation the array accesses from two
successive iterations of the innermost loop:
A[j][i] -> A[j][i+1] and B[j][i] -> B[j][i+1]
With these two extra relations in the proximity map,
isl should be able to interchange the above loop.


>
> I can trick GRAPHITE into performing the interchange for
>
> double A[1024][1024], B[1024][1024];
>
> void foo(void)
> {
>   for (int i = 1; i < 1023; ++i)
> for (int j = 0; j < 1024; ++j)
>   A[j][i] = B[j][i-1] + A[j][i+1];
> }
>
> because now there is a dependence.  Any idea on how to rewrite
> scop_get_dependences to avoid "simplifying"?  I suppose the
> validity constraints _do_ also specify kind-of a proximity
>

Correct: the validity map specifies a subset (it is missing
RAR dependences) of data reuse.


> we just may not prune / optimize them in the same way as
> dependences?
>

Validity constraints are there to "keep the wind blowing
in the same direction" after the transform (otherwise the
result of the transformed computation may be wrong.)

The proximity map should contain a description of
- reuse of memory (temporal locality)
- how close together the access elements are (spatial locality.)
isl will optimize for both if the proximity map has a description
of both.

For the moment the proximity map is initialized only with the
current validity constraints, as you quoted the FIXME comment,
which would only describe a subset of the temporal locality.

Sebastian


Re: [PATCH][GRAPHITE] More TLC

2017-09-28 Thread Sebastian Pop
Hi skimo,

On Tue, Sep 26, 2017 at 10:15 AM, Sven Verdoolaege <
sven.verdoola...@gmail.com> wrote:

> On Tue, Sep 26, 2017 at 09:19:50AM -0500, Sebastian Pop wrote:
> > Sven, is there already a function that computes the sum of all
> > strides in a proximity map?  Maybe you have code that does
> > something similar in pet or ppcg?
>
> What exactly do you want to sum?

> If this involves any counting, then it cannot currently
>

I think that it does involve counting: we need to know
the distance between all pairs of array accesses, that is the
number of points in the dependence polyhedron.


> be done in pet or ppcg since isl does not support counting yet
> and the public version of barvinok is GPL licensed.
>
> Also, it's better to ask such questions on the isl mailing list
> isl-developm...@googlegroups.com
>
>
We are trying to find a metric that shows that isl's scheduler
did a useful transform.  Something like a diff tool that shows
before and after scheduling the strides of array accesses.

Could the isl scheduler output a description of what it did?
We would like to use that output to build testcases that match
the behavior of the compiler on different patterns.

Thanks,
Sebastian


Re: [PATCH][GRAPHITE] More TLC

2017-09-27 Thread Richard Biener
On Wed, 27 Sep 2017, Richard Biener wrote:

> On Wed, 27 Sep 2017, Richard Biener wrote:
> 
> > On Tue, 26 Sep 2017, Sebastian Pop wrote:
> > 
> > > On Mon, Sep 25, 2017 at 8:12 AM, Richard Biener  wrote:
> > > 
> > > > On Fri, 22 Sep 2017, Sebastian Pop wrote:
> > > >
> > > > > On Fri, Sep 22, 2017 at 8:03 AM, Richard Biener 
> > > > wrote:
> > > > >
> > > > > >
> > > > > > This simplifies canonicalize_loop_closed_ssa and does other minimal
> > > > > > TLC.  It also adds a testcase I reduced from a stupid mistake I made
> > > > > > when reworking canonicalize_loop_closed_ssa.
> > > > > >
> > > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to 
> > > > > > trunk.
> > > > > >
> > > > > > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> > > > > > -Ofast -march=haswell -floop-nest-optimize are
> > > > > >
> > > > > >  61 loop nests "optimized"
> > > > > >  45 loop nest transforms cancelled because of code generation issues
> > > > > >  21 loop nest optimizations timed out the 35 ISL "operations" we
> > > > allow
> > > > > >
> > > > > > I say "optimized" because the usual transform I've seen is static
> > > > tiling
> > > > > > as enforced by GRAPHITE according to --param loop-block-tile-size.
> > > > > > There's no way to automagically figure what kind of transform ISL 
> > > > > > did
> > > > > >
> > > > >
> > > > > Here is how to automate (without magic) the detection
> > > > > of the transform that isl did.
> > > > >
> > > > > The problem solved by isl is the minimization of strides
> > > > > in memory, and to do this, we need to tell the isl scheduler
> > > > > the validity dependence graph, in graphite-optimize-isl.c
> > > > > see the validity (RAW, WAR, WAW) and the proximity
> > > > > (RAR + validity) maps.  The proximity does include the
> > > > > read after read, as the isl scheduler needs to minimize
> > > > > strides between consecutive reads.
> > 
> > Ah, so I now see why we do not perform interchange on trivial cases like
> > 
> > double A[1024][1024], B[1024][1024];
> > 
> > void foo(void)
> > {
> >   for (int i = 0; i < 1024; ++i)
> > for (int j = 0; j < 1024; ++j)
> >   A[j][i] = B[j][i];
> > }
> > 
> > which is probably because
> > 
> >   /* FIXME: proximity should not be validity.  */
> >   isl_union_map *proximity = isl_union_map_copy (validity);
> > 
> > falls apart when there is _no_ dependence?
> > 
> > I can trick GRAPHITE into performing the interchange for
> > 
> > double A[1024][1024], B[1024][1024];
> > 
> > void foo(void)
> > {
> >   for (int i = 1; i < 1023; ++i)
> > for (int j = 0; j < 1024; ++j)
> >   A[j][i] = B[j][i-1] + A[j][i+1];
> > }
> > 
> > because now there is a dependence.  Any idea on how to rewrite
> > scop_get_dependences to avoid "simplifying"?  I suppose the
> > validity constraints _do_ also specify kind-of a proximity
> > we just may not prune / optimize them in the same way as
> > dependences?
> 
> Another thing I notice is that we don't handle the multi-dimensional
> accesses the fortran frontend produces:
> 
> (gdb) p debug_data_reference (dr)
> #(Data Ref: 
> #  bb: 18 
> #  stmt: _43 = *a_141(D)[_42];
> #  ref: *a_141(D)[_42];
> #  base_object: *a_141(D);
> #  Access function 0: {{(_38 + stride.88_115) + 1, +, 1}_4, +, 
> stride.88_115}_5
> 
> ultimately we fail here because we try to build a constraint for
> 
> {{(_38 + stride.88_115) + 1, +, 1}_4, +, stride.88_115}_5
> 
> which ends up computing isl_pw_aff_mul (A, stride.88_115) with
> A being the non-constant constraint generated for
> {(_38 + stride.88_115) + 1, +, 1}_4 and stride.88_115 being
> a parameter.  ISL doesn't like that multiplication as the result
> isn't affine (well - it is, we just have parameters in there).
> 
> I suppose ISL doesn't handle this form of accesses given the
> two "dimensions" in this scalarized form may overlap?  So we'd
> really need to turn those into references with different access
> functions (even if that's not 100% a valid semantic transformation
> as scalarization isn't reversible without extra information)?

Looks like even when hacking the Fortran FE to produce nested
ARRAY_REFs we run into the same issue for

(gdb) p debug_data_reference (dr)
#(Data Ref: 
#  bb: 17 
#  stmt: 
VIEW_CONVERT_EXPR(*y_117(D))[_24]{lb:
 
1 sz: _20 * 8}[_26]{lb: 1 sz: _21 * 8}[_28]{lb: 1 sz: _22 * 8}[_29]{lb: 1 
sz: 8} = 0.0;
#  ref: 
VIEW_CONVERT_EXPR(*y_117(D))[_24]{lb:
 
1 sz: _20 * 8}[_26]{lb: 1 sz: _21 * 8}[_28]{lb: 1 sz: _22 * 8}[_29]{lb: 1 
sz: 8};
#  base_object: 
VIEW_CONVERT_EXPR(*y_117(D));
#  Access function 0: {1, +, 1}_4
#  Access function 1: (integer(kind=8)) {(unsigned long) stride.88_92, +, 
(unsigned long) stride.88_92}_3;
#  Access function 2: (integer(kind=8)) {(unsigned 

Re: [PATCH][GRAPHITE] More TLC

2017-09-27 Thread Richard Biener
On Wed, 27 Sep 2017, Richard Biener wrote:

> On Tue, 26 Sep 2017, Sebastian Pop wrote:
> 
> > On Mon, Sep 25, 2017 at 8:12 AM, Richard Biener  wrote:
> > 
> > > On Fri, 22 Sep 2017, Sebastian Pop wrote:
> > >
> > > > On Fri, Sep 22, 2017 at 8:03 AM, Richard Biener 
> > > wrote:
> > > >
> > > > >
> > > > > This simplifies canonicalize_loop_closed_ssa and does other minimal
> > > > > TLC.  It also adds a testcase I reduced from a stupid mistake I made
> > > > > when reworking canonicalize_loop_closed_ssa.
> > > > >
> > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
> > > > >
> > > > > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> > > > > -Ofast -march=haswell -floop-nest-optimize are
> > > > >
> > > > >  61 loop nests "optimized"
> > > > >  45 loop nest transforms cancelled because of code generation issues
> > > > >  21 loop nest optimizations timed out the 35 ISL "operations" we
> > > allow
> > > > >
> > > > > I say "optimized" because the usual transform I've seen is static
> > > tiling
> > > > > as enforced by GRAPHITE according to --param loop-block-tile-size.
> > > > > There's no way to automagically figure what kind of transform ISL did
> > > > >
> > > >
> > > > Here is how to automate (without magic) the detection
> > > > of the transform that isl did.
> > > >
> > > > The problem solved by isl is the minimization of strides
> > > > in memory, and to do this, we need to tell the isl scheduler
> > > > the validity dependence graph, in graphite-optimize-isl.c
> > > > see the validity (RAW, WAR, WAW) and the proximity
> > > > (RAR + validity) maps.  The proximity does include the
> > > > read after read, as the isl scheduler needs to minimize
> > > > strides between consecutive reads.
> 
> Ah, so I now see why we do not perform interchange on trivial cases like
> 
> double A[1024][1024], B[1024][1024];
> 
> void foo(void)
> {
>   for (int i = 0; i < 1024; ++i)
> for (int j = 0; j < 1024; ++j)
>   A[j][i] = B[j][i];
> }
> 
> which is probably because
> 
>   /* FIXME: proximity should not be validity.  */
>   isl_union_map *proximity = isl_union_map_copy (validity);
> 
> falls apart when there is _no_ dependence?
> 
> I can trick GRAPHITE into performing the interchange for
> 
> double A[1024][1024], B[1024][1024];
> 
> void foo(void)
> {
>   for (int i = 1; i < 1023; ++i)
> for (int j = 0; j < 1024; ++j)
>   A[j][i] = B[j][i-1] + A[j][i+1];
> }
> 
> because now there is a dependence.  Any idea on how to rewrite
> scop_get_dependences to avoid "simplifying"?  I suppose the
> validity constraints _do_ also specify kind-of a proximity
> we just may not prune / optimize them in the same way as
> dependences?

Another thing I notice is that we don't handle the multi-dimensional
accesses the fortran frontend produces:

(gdb) p debug_data_reference (dr)
#(Data Ref: 
#  bb: 18 
#  stmt: _43 = *a_141(D)[_42];
#  ref: *a_141(D)[_42];
#  base_object: *a_141(D);
#  Access function 0: {{(_38 + stride.88_115) + 1, +, 1}_4, +, 
stride.88_115}_5

ultimately we fail here because we try to build a constraint for

{{(_38 + stride.88_115) + 1, +, 1}_4, +, stride.88_115}_5

which ends up computing isl_pw_aff_mul (A, stride.88_115) with
A being the non-constant constraint generated for
{(_38 + stride.88_115) + 1, +, 1}_4 and stride.88_115 being
a parameter.  ISL doesn't like that multiplication as the result
isn't affine (well - it is, we just have parameters in there).

I suppose ISL doesn't handle this form of accesses given the
two "dimensions" in this scalarized form may overlap?  So we'd
really need to turn those into references with different access
functions (even if that's not 100% a valid semantic transformation
as scalarization isn't reversible without extra information)?

Thanks,
Richard.


[PATCH][GRAPHITE] Make --param loop-block-tile-size=0 disable tiling

2017-09-27 Thread Richard Biener

Currently ISL aborts on this special value and for debugging (and 
tuning?) it's nice to avoid all the clutter introduced by tiling.

Committed as obvious.

Richard.

2017-09-27  Richard Biener  

* graphite-optimize-isl.c (get_schedule_for_node_st): Allow
--param loop-block-tile-size=0 to disable tiling.

Index: gcc/graphite-optimize-isl.c
===
--- gcc/graphite-optimize-isl.c (revision 253226)
+++ gcc/graphite-optimize-isl.c (working copy)
@@ -64,7 +64,10 @@ get_schedule_for_node_st (__isl_take isl
   if (type != isl_schedule_node_leaf)
 return node;
 
-  if (dims <= 1 || !isl_schedule_node_band_get_permutable (node))
+  long tile_size = PARAM_VALUE (PARAM_LOOP_BLOCK_TILE_SIZE);
+  if (dims <= 1
+  || tile_size == 0
+  || !isl_schedule_node_band_get_permutable (node))
 {
   if (dump_file && dump_flags)
fprintf (dump_file, "not tiled\n");
@@ -74,7 +77,6 @@ get_schedule_for_node_st (__isl_take isl
   /* Tile loops.  */
   space = isl_schedule_node_band_get_space (node);
   isl_multi_val *sizes = isl_multi_val_zero (space);
-  long tile_size = PARAM_VALUE (PARAM_LOOP_BLOCK_TILE_SIZE);
   isl_ctx *ctx = isl_schedule_node_get_ctx (node);
 
   for (unsigned i = 0; i < dims; i++)


Re: [PATCH][GRAPHITE] More TLC

2017-09-27 Thread Richard Biener
On Tue, 26 Sep 2017, Sebastian Pop wrote:

> On Mon, Sep 25, 2017 at 8:12 AM, Richard Biener  wrote:
> 
> > On Fri, 22 Sep 2017, Sebastian Pop wrote:
> >
> > > On Fri, Sep 22, 2017 at 8:03 AM, Richard Biener 
> > wrote:
> > >
> > > >
> > > > This simplifies canonicalize_loop_closed_ssa and does other minimal
> > > > TLC.  It also adds a testcase I reduced from a stupid mistake I made
> > > > when reworking canonicalize_loop_closed_ssa.
> > > >
> > > > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
> > > >
> > > > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> > > > -Ofast -march=haswell -floop-nest-optimize are
> > > >
> > > >  61 loop nests "optimized"
> > > >  45 loop nest transforms cancelled because of code generation issues
> > > >  21 loop nest optimizations timed out the 35 ISL "operations" we
> > allow
> > > >
> > > > I say "optimized" because the usual transform I've seen is static
> > tiling
> > > > as enforced by GRAPHITE according to --param loop-block-tile-size.
> > > > There's no way to automagically figure what kind of transform ISL did
> > > >
> > >
> > > Here is how to automate (without magic) the detection
> > > of the transform that isl did.
> > >
> > > The problem solved by isl is the minimization of strides
> > > in memory, and to do this, we need to tell the isl scheduler
> > > the validity dependence graph, in graphite-optimize-isl.c
> > > see the validity (RAW, WAR, WAW) and the proximity
> > > (RAR + validity) maps.  The proximity does include the
> > > read after read, as the isl scheduler needs to minimize
> > > strides between consecutive reads.

Ah, so I now see why we do not perform interchange on trivial cases like

double A[1024][1024], B[1024][1024];

void foo(void)
{
  for (int i = 0; i < 1024; ++i)
for (int j = 0; j < 1024; ++j)
  A[j][i] = B[j][i];
}

which is probably because

  /* FIXME: proximity should not be validity.  */
  isl_union_map *proximity = isl_union_map_copy (validity);

falls apart when there is _no_ dependence?

I can trick GRAPHITE into performing the interchange for

double A[1024][1024], B[1024][1024];

void foo(void)
{
  for (int i = 1; i < 1023; ++i)
for (int j = 0; j < 1024; ++j)
  A[j][i] = B[j][i-1] + A[j][i+1];
}

because now there is a dependence.  Any idea on how to rewrite
scop_get_dependences to avoid "simplifying"?  I suppose the
validity constraints _do_ also specify kind-of a proximity
we just may not prune / optimize them in the same way as
dependences?

Richard.



[PATCH][GRAPHITE] Allow --param graphite-max-arrays-per-scop=0

2017-09-27 Thread Richard Biener

The following is to allow making --param graphite-max-arrays-per-scop
unbounded.  That's a little tricky because the bound is used when
computing "alias-sets" for scalar constraints.  There's an easy way
out though as we know the maximum alias-set assigned in the SCOP,
we only have to remember it.  The advantage (if it matters at all)
is that we avoid a constraint coefficient gap between that last
used alias-set and the former PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP.

Bootstrap and regtest running on x86_64-unknown-linux-gnu, SPEC CPU 2006
tested.  Will apply after testing finished.

Richard.

2017-09-27  Richard Biener  

* graphite.h (scop::max_alias_set): New member.
* graphite-scop-detection.c: Remove references to non-existing
--param in comments.
(build_alias_sets): Record the maximum alias set used for drs.
(build_scops): Support zero as unlimited for
--param graphite-max-arrays-per-scop.
* graphite-sese-to-poly.c (add_scalar_version_numbers): Remove
and inline into ...
(build_poly_sr_1): ... here.  Compute alias set based on the
maximum alias set used for drs rather than
PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP

Index: gcc/graphite-scop-detection.c
===
--- gcc/graphite-scop-detection.c   (revision 253226)
+++ gcc/graphite-scop-detection.c   (working copy)
@@ -389,10 +389,7 @@ public:
 
   void remove_intersecting_scops (sese_l s1);
 
-  /* Return true when a statement in SCOP cannot be represented by Graphite.
- The assumptions are that L1 dominates L2, and SCOP->entry dominates L1.
- Limit the number of bbs between adjacent loops to
- PARAM_SCOP_MAX_NUM_BBS_BETWEEN_LOOPS.  */
+  /* Return true when a statement in SCOP cannot be represented by Graphite.  
*/
 
   bool harmful_loop_in_region (sese_l scop) const;
 
@@ -760,10 +757,7 @@ scop_detection::add_scop (sese_l s)
   DEBUG_PRINT (dp << "[scop-detection] Adding SCoP: "; print_sese (dump_file, 
s));
 }
 
-/* Return true when a statement in SCOP cannot be represented by Graphite.
-   The assumptions are that L1 dominates L2, and SCOP->entry dominates L1.
-   Limit the number of bbs between adjacent loops to
-   PARAM_SCOP_MAX_NUM_BBS_BETWEEN_LOOPS.  */
+/* Return true when a statement in SCOP cannot be represented by Graphite.  */
 
 bool
 scop_detection::harmful_loop_in_region (sese_l scop) const
@@ -1531,7 +1525,8 @@ build_alias_set (scop_p scop)
   for (i = 0; i < num_vertices; i++)
 all_vertices[i] = i;
 
-  graphds_dfs (g, all_vertices, num_vertices, NULL, true, NULL);
+  scop->max_alias_set
+= graphds_dfs (g, all_vertices, num_vertices, NULL, true, NULL) + 1;
   free (all_vertices);
 
   for (i = 0; i < g->n_vertices; i++)
@@ -1755,7 +1750,8 @@ build_scops (vec *scops)
}
 
   unsigned max_arrays = PARAM_VALUE (PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP);
-  if (scop->drs.length () >= max_arrays)
+  if (max_arrays > 0
+ && scop->drs.length () >= max_arrays)
{
  DEBUG_PRINT (dp << "[scop-detection-fail] too many data references: "
   << scop->drs.length ()
Index: gcc/graphite-sese-to-poly.c
===
--- gcc/graphite-sese-to-poly.c (revision 253225)
+++ gcc/graphite-sese-to-poly.c (working copy)
@@ -491,25 +491,6 @@ pdr_add_alias_set (isl_map *acc, dr_info
   return isl_map_add_constraint (acc, c);
 }
 
-/* Add a constrain to the ACCESSES polyhedron for the alias set of
-   data reference DR.  ACCESSP_NB_DIMS is the dimension of the
-   ACCESSES polyhedron, DOM_NB_DIMS is the dimension of the iteration
-   domain.  */
-
-static isl_map *
-add_scalar_version_numbers (isl_map *acc, tree var)
-{
-  isl_constraint *c = isl_equality_alloc
-  (isl_local_space_from_space (isl_map_get_space (acc)));
-  int max_arrays = PARAM_VALUE (PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP);
-  /* Each scalar variables has a unique alias set number starting from
- max_arrays.  */
-  c = isl_constraint_set_constant_si (c, -max_arrays - SSA_NAME_VERSION (var));
-  c = isl_constraint_set_coefficient_si (c, isl_dim_out, 0, 1);
-
-  return isl_map_add_constraint (acc, c);
-}
-
 /* Assign the affine expression INDEX to the output dimension POS of
MAP and return the result.  */
 
@@ -684,13 +665,21 @@ static void
 build_poly_sr_1 (poly_bb_p pbb, gimple *stmt, tree var, enum poly_dr_type kind,
 isl_map *acc, isl_set *subscript_sizes)
 {
-  int max_arrays = PARAM_VALUE (PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP);
+  scop_p scop = PBB_SCOP (pbb);
   /* Each scalar variables has a unique alias set number starting from
- max_arrays.  */
+ the maximum alias set assigned to a dr.  */
+  int alias_set = scop->max_alias_set + SSA_NAME_VERSION (var);
   subscript_sizes = isl_set_fix_si (subscript_sizes, isl_dim_set, 0,
-   

[PATCH][GRAPHITE] Remove another small quadraticness

2017-09-27 Thread Richard Biener

Turns out loop_nest recorded in scop-info isn't really necessary as
we can simply process parameters in loop bounds during the gather_bbs
walk where we encounter each loop (identified by its header) once.

This avoids the linear search in record_loop_in_sese.

Bootstrap / regtest running on x86_64-unknown-linux-gnu, will apply.

Richard.

2017-09-27  Richard Biener  

* graphite-scop-detection.c (find_scop_parameters): Move
loop bound handling ...
(gather_bbs::before_dom_children): ... here, avoiding the need
to build scop_info->loop_nest.
(record_loop_in_sese): Remove.
* sese.h (sese_info_t::loop_nest): Remove.
* sese.c (new_sese_info): Do not allocate loop_nest.
(free_sese_info): Do not free loop_nest.

Index: gcc/graphite-scop-detection.c
===
--- gcc/graphite-scop-detection.c   (revision 253226)
+++ gcc/graphite-scop-detection.c   (working copy)
@@ -1330,7 +1324,7 @@ find_params_in_bb (sese_info_p region, g
 }
 }
 
-/* Record the parameters used in the SCOP.  A variable is a parameter
+/* Record the parameters used in the SCOP BBs.  A variable is a parameter
in a scop if it does not vary during the execution of that scop.  */
 
 static void
@@ -1338,19 +1332,8 @@ find_scop_parameters (scop_p scop)
 {
   unsigned i;
   sese_info_p region = scop->scop_info;
-  struct loop *loop;
 
-  /* Find the parameters used in the loop bounds.  */
-  FOR_EACH_VEC_ELT (region->loop_nest, i, loop)
-{
-  tree nb_iters = number_of_latch_executions (loop);
-
-  if (!chrec_contains_symbols (nb_iters))
-   continue;
-
-  nb_iters = scalar_evolution_in_region (region->region, loop, nb_iters);
-  scan_tree_for_params (region, nb_iters);
-}
+  /* Parameters used in loop bounds are processed during gather_bbs.  */
 
   /* Find the parameters used in data accesses.  */
   poly_bb_p pbb;
@@ -1560,28 +1544,6 @@ gather_bbs::gather_bbs (cdi_direction di
 {
 }
 
-/* Record in execution order the loops fully contained in the region.  */
-
-static void
-record_loop_in_sese (basic_block bb, sese_info_p region)
-{
-  loop_p father = bb->loop_father;
-  if (loop_in_sese_p (father, region->region))
-{
-  bool found = false;
-  loop_p loop0;
-  int j;
-  FOR_EACH_VEC_ELT (region->loop_nest, j, loop0)
-   if (father == loop0)
- {
-   found = true;
-   break;
- }
-  if (!found)
-   region->loop_nest.safe_push (father);
-}
-}
-
 /* Call-back for dom_walk executed before visiting the dominated
blocks.  */
 
@@ -1592,7 +1554,20 @@ gather_bbs::before_dom_children (basic_b
   if (!bb_in_sese_p (bb, region->region))
 return dom_walker::STOP;
 
-  record_loop_in_sese (bb, region);
+  /* For loops fully contained in the region record parameters in the
+ loop bounds.  */
+  loop_p loop = bb->loop_father;
+  if (loop->header == bb
+  && loop_in_sese_p (loop, region->region))
+{
+  tree nb_iters = number_of_latch_executions (loop);
+  if (chrec_contains_symbols (nb_iters))
+   {
+ nb_iters = scalar_evolution_in_region (region->region,
+loop, nb_iters);
+ scan_tree_for_params (region, nb_iters);
+   }
+}
 
   gcond *stmt = single_pred_cond_non_loop_exit (bb);
 
Index: gcc/sese.c
===
--- gcc/sese.c  (revision 253226)
+++ gcc/sese.c  (working copy)
@@ -179,7 +179,6 @@ new_sese_info (edge entry, edge exit)
 
   region->region.entry = entry;
   region->region.exit = exit;
-  region->loop_nest.create (3);
   region->params.create (3);
   region->rename_map = new rename_map_t;
   region->parameter_rename_map = new parameter_rename_map_t;
@@ -197,7 +196,6 @@ void
 free_sese_info (sese_info_p region)
 {
   region->params.release ();
-  region->loop_nest.release ();
 
   for (rename_map_t::iterator it = region->rename_map->begin ();
it != region->rename_map->end (); ++it)
Index: gcc/sese.h
===
--- gcc/sese.h  (revision 253226)
+++ gcc/sese.h  (working copy)
@@ -94,9 +94,6 @@ typedef struct sese_info_t
   /* Parameters to be renamed.  */
   parameter_rename_map_t *parameter_rename_map;
 
-  /* Loops completely contained in this SESE.  */
-  vec loop_nest;
-
   /* Basic blocks contained in this SESE.  */
   vec bbs;
 


[PATCH][GRAPHITE] Speedup SCOP detection some more, add region handling to domwalk

2017-09-27 Thread Richard Biener

This removes another quadraticness from SCOP detection, gather_bbs
domwalk.  This is done by enhancing domwalk to handle SEME regions
via a special return value from before_dom_children.

With this I'm now confident to remove the 
PARAM_GRAPHITE_MAX_BBS_PER_FUNCTION parameter and its associated limit.
Being there I've adjusted PARAM_GRAPHITE_MAX_NB_SCOP_PARAMS to its
documented default value which enables 90 more loops to be processed
in SPEC CPU 2006.  I've also made a value of zero magic in disabling
the limit (a trick commonly used in GCC).

Statistics I have gathered a few patches before for SPEC CPU 2006:

1255 multi-loop SESEs in SCOP processing
max. params 34, 3 scops >= 20, 15 scops >= 10, 33 scops >= 8
max. drs per scop 869, 10 scops >= 100
max. pbbs per scop 36, 12 scops >= 10
919 SCOPs fail in build_alias_sets

which shows the default for PARAM_GRAPHITE_MAX_ARRAYS_PER_SCOP
is reasonable (if tuned to SPEC CPU 2006).

I've also included the hunk that allows -fgraphite-identity
to work on top of -floop-nest-optimize and, for -floop-nest-optimize
-floop-parallelize-all, also makes sure to code-gen loops that
end up not transformed.

Bootstrapped and tested on x86_64-unknown-linux-gnu, SPEC CPU 2006
tested, applied to trunk.

Richard.

2017-09-27  Richard Biener  

* doc/invoke.texi (graphite-max-bbs-per-function): Remove.
(graphite-max-nb-scop-params): Document special value zero.
* domwalk.h (dom_walker::STOP): New symbolical constant.
(dom_walker::dom_walker): Add optional parameter for bb to
RPO mapping.
(dom_walker::~dom_walker): Declare.
(dom_walker::before_dom_children): Document STOP return value.
(dom_walker::m_user_bb_to_rpo): New member.
(dom_walker::m_bb_to_rpo): Likewise.
* domwalk.c (dom_walker::dom_walker): Compute bb to RPO
mapping here if not provided by the user.
(dom_walker::~dom_walker): Free bb to RPO mapping if not
provided by the user.
(dom_walker::STOP): Define.
(dom_walker::walk): Do not compute bb to RPO mapping here.
Support STOP return value from before_dom_children to stop
walking.
* graphite-optimize-isl.c (optimize_isl): If the schedule
is the same still generate code if -fgraphite-identity
or -floop-parallelize-all are given.
* graphite-scop-detection.c: Include cfganal.h.
(gather_bbs::gather_bbs): Get and pass through bb to RPO
mapping.
(gather_bbs::before_dom_children): Return STOP for BBs
not in the region.
(build_scops): Compute bb to RPO mapping and pass it to
the domwalk.  Treat --param graphite-max-nb-scop-params=0
as not limiting the number of params.
* graphite.c (graphite_initialize): Remove limit on the
number of basic-blocks in a function.
* params.def (PARAM_GRAPHITE_MAX_BBS_PER_FUNCTION): Remove.
(PARAM_GRAPHITE_MAX_NB_SCOP_PARAMS): Adjust to documented
default value of 10.

Index: gcc/doc/invoke.texi
===
--- gcc/doc/invoke.texi (revision 253224)
+++ gcc/doc/invoke.texi (working copy)
@@ -10512,13 +10512,9 @@ sequence pairs.  This option only applie
 @item graphite-max-nb-scop-params
 To avoid exponential effects in the Graphite loop transforms, the
 number of parameters in a Static Control Part (SCoP) is bounded.  The
-default value is 10 parameters.  A variable whose value is unknown at
-compilation time and defined outside a SCoP is a parameter of the SCoP.
-
-@item graphite-max-bbs-per-function
-To avoid exponential effects in the detection of SCoPs, the size of
-the functions analyzed by Graphite is bounded.  The default value is
-100 basic blocks.
+default value is 10 parameters, a value of zero can be used to lift
+the bound.  A variable whose value is unknown at compilation time and
+defined outside a SCoP is a parameter of the SCoP.
 
 @item loop-block-tile-size
 Loop blocking or strip mining transforms, enabled with
Index: gcc/domwalk.c
===
--- gcc/domwalk.c   (revision 253224)
+++ gcc/domwalk.c   (working copy)
@@ -174,13 +174,29 @@ sort_bbs_postorder (basic_block *bbs, in
If SKIP_UNREACHBLE_BLOCKS is true, then we need to set
EDGE_EXECUTABLE on every edge in the CFG. */
 dom_walker::dom_walker (cdi_direction direction,
-   bool skip_unreachable_blocks)
+   bool skip_unreachable_blocks,
+   int *bb_index_to_rpo)
   : m_dom_direction (direction),
 m_skip_unreachable_blocks (skip_unreachable_blocks),
-m_unreachable_dom (NULL)
+m_user_bb_to_rpo (bb_index_to_rpo != NULL),
+m_unreachable_dom (NULL),
+m_bb_to_rpo (bb_index_to_rpo)
 {
+  /* Compute the basic-block index to RPO mapping if not provided by
+ the user.  */
+  if (! m_bb_to_rpo && direction 

Re: [PATCH][GRAPHITE] Simplify SCOP detection

2017-09-27 Thread Richard Biener
On Tue, 26 Sep 2017, Sebastian Pop wrote:

> On Tue, Sep 26, 2017 at 7:03 AM, Richard Biener  wrote:
> 
> >
> > The following is the result of me trying to understand SCOP detection
> > and the validity checks spread around the machinery.  It removes several
> > quadraticnesses by folding validity checks into
> > scop_detection::harmful_loop_in_region where we already walk over all
> > BBs in the region and process individual found loops.
> >
> > It also rewrites build_scop_depth/build_scop_breadth into something
> > I can understand.
> >
> > Bootstrap and regtest is running on x86_64-unknown-linux-gnu (graphite.exp
> > for all langs is happy, so is SPEC CPU 2006 testing where the statistics
> > agree before/after the patch).
> >
> > I'll apply this after the bootstrap finished.
> >
> 
> Have you tried to bootstrap with BOOT_CFLAGS="-O2 -fgraphite-identity"?

I do "-O2 -g -floop-nest-optimize" but I guess -fgraphite-identity
should catch more issues?  Hmm, maybe -floop-nest-optimize and
-fgraphite-identity should be combinable

Index: gcc/graphite-optimize-isl.c
===
--- gcc/graphite-optimize-isl.c (revision 253203)
+++ gcc/graphite-optimize-isl.c (working copy)
@@ -189,7 +189,7 @@ optimize_isl (scop_p scop)
print_schedule_ast (dump_file, scop->original_schedule, scop);
   isl_schedule_free (scop->transformed_schedule);
   scop->transformed_schedule = isl_schedule_copy 
(scop->original_schedule);
-  return false;
+  return flag_graphite_identity || flag_loop_parallelize_all;
 }
 
   return true;

I'll test/commit the above.

> 
> > Richard.
> >
> > 2017-09-26  Richard Biener  
> >
> > * graphite-scop-detection.c (scop_detection::build_scop_depth):
> > Rewrite,
> > fold in ...
> > (scop_detection::build_scop_breadth): ... this.  Removed.
> > (scop_detection::loop_is_valid_in_scop): Fold into single caller.
> > (scop_detection::harmful_stmt_in_bb): Likewise.
> > (scop_detection::graphite_can_represent_stmt): Likewise.
> > (scop_detection::loop_body_is_valid_scop): Likewise.  Remove
> > recursion.
> > (scop_detection::can_represent_loop): Remove recursion, fold in
> > ...
> > (scop_detection::can_represent_loop_1): ... this.  Removed.
> > (scop_detection::harmful_loop_in_region): Simplify after inlining
> > the above and remove more quadraticness.
> > (build_scops): Adjust.
> > * tree-data-ref.c (loop_nest_has_data_refs): Remove pointless
> > quadraticness.
> >
> >
> This goes in the right direction: it cuts down compilation time.
> As it is not a trivial change, I need some time to understand how
> the scop detection works with this change.

The only functional change should be that the SESE composition now
works top-down instead of working its way bottom-up.  It's not clear
whether we do more or less work that way but at least the function
is now readable -- I think we can structure it bottom-up as well
without doing the confusing two-function way (which I believe
did quite some duplicate work but I never was sure...).

Richard.


Re: [PATCH][GRAPHITE] More TLC

2017-09-26 Thread Sven Verdoolaege
On Tue, Sep 26, 2017 at 09:19:50AM -0500, Sebastian Pop wrote:
> Sven, is there already a function that computes the sum of all
> strides in a proximity map?  Maybe you have code that does
> something similar in pet or ppcg?

What exactly do you want to sum?
If this involves any counting, then it cannot currently
be done in pet or ppcg since isl does not support counting yet
and the public version of barvinok is GPL licensed.

Also, it's better to ask such questions on the isl mailing list
isl-developm...@googlegroups.com

skimo


Re: [PATCH][GRAPHITE] Simplify SCOP detection

2017-09-26 Thread Sebastian Pop
On Tue, Sep 26, 2017 at 7:03 AM, Richard Biener  wrote:

>
> The following is the result of me trying to understand SCOP detection
> and the validity checks spread around the machinery.  It removes several
> quadraticnesses by folding validity checks into
> scop_detection::harmful_loop_in_region where we already walk over all
> BBs in the region and process individual found loops.
>
> It also rewrites build_scop_depth/build_scop_breadth into something
> I can understand.
>
> Bootstrap and regtest is running on x86_64-unknown-linux-gnu (graphite.exp
> for all langs is happy, so is SPEC CPU 2006 testing where the statistics
> agree before/after the patch).
>
> I'll apply this after the bootstrap finished.
>

Have you tried to bootstrap with BOOT_CFLAGS="-O2 -fgraphite-identity"?


> Richard.
>
> 2017-09-26  Richard Biener  
>
> * graphite-scop-detection.c (scop_detection::build_scop_depth):
> Rewrite,
> fold in ...
> (scop_detection::build_scop_breadth): ... this.  Removed.
> (scop_detection::loop_is_valid_in_scop): Fold into single caller.
> (scop_detection::harmful_stmt_in_bb): Likewise.
> (scop_detection::graphite_can_represent_stmt): Likewise.
> (scop_detection::loop_body_is_valid_scop): Likewise.  Remove
> recursion.
> (scop_detection::can_represent_loop): Remove recursion, fold in
> ...
> (scop_detection::can_represent_loop_1): ... this.  Removed.
> (scop_detection::harmful_loop_in_region): Simplify after inlining
> the above and remove more quadraticness.
> (build_scops): Adjust.
> * tree-data-ref.c (loop_nest_has_data_refs): Remove pointless
> quadraticness.
>
>
This goes in the right direction: it cuts down compilation time.
As it is not a trivial change, I need some time to understand how
the scop detection works with this change.

Sebastian


Re: [PATCH][GRAPHITE] More TLC

2017-09-26 Thread Sebastian Pop
On Mon, Sep 25, 2017 at 8:12 AM, Richard Biener  wrote:

> On Fri, 22 Sep 2017, Sebastian Pop wrote:
>
> > On Fri, Sep 22, 2017 at 8:03 AM, Richard Biener 
> wrote:
> >
> > >
> > > This simplifies canonicalize_loop_closed_ssa and does other minimal
> > > TLC.  It also adds a testcase I reduced from a stupid mistake I made
> > > when reworking canonicalize_loop_closed_ssa.
> > >
> > > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
> > >
> > > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> > > -Ofast -march=haswell -floop-nest-optimize are
> > >
> > >  61 loop nests "optimized"
> > >  45 loop nest transforms cancelled because of code generation issues
> > >  21 loop nest optimizations timed out the 35 ISL "operations" we
> allow
> > >
> > > I say "optimized" because the usual transform I've seen is static
> tiling
> > > as enforced by GRAPHITE according to --param loop-block-tile-size.
> > > There's no way to automagically figure what kind of transform ISL did
> > >
> >
> > Here is how to automate (without magic) the detection
> > of the transform that isl did.
> >
> > The problem solved by isl is the minimization of strides
> > in memory, and to do this, we need to tell the isl scheduler
> > the validity dependence graph, in graphite-optimize-isl.c
> > see the validity (RAW, WAR, WAW) and the proximity
> > (RAR + validity) maps.  The proximity does include the
> > read after read, as the isl scheduler needs to minimize
> > strides between consecutive reads.
> >
> > When you apply the schedule to the dependence graph,
> > one can tell from the result the strides in memory, a good
> > way to say whether a transform was beneficial is to sum up
> > all memory strides, and make sure that the sum of all strides
> > decreases after transform.  We could add a printf with the
> > sum of strides before and after transforms, and have the
> > testcases check for that.
>
> Interesting.  Can you perhaps show me in code how to do that?
>
>
Sven, is there already a function that computes the sum of all
strides in a proximity map?  Maybe you have code that does
something similar in pet or ppcg?

Thanks,
Sebastian


Re: [PATCH][GRAPHITE] More -fopt-info, do not abort from ISL

2017-09-26 Thread Sebastian Pop
On Mon, Sep 25, 2017 at 4:47 AM, Richard Biener  wrote:

>
> The following also dumps if the optimized schedule is equal to the
> original one.  It also makes all ISL operations (well, nearly) not
> abort on errors but instead propagate errors upward.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
>
> Richard.
>
> 2017-09-25  Richard Biener  
>
> * graphite-optimize-isl.c (optimize_isl): Fail and dump if
> ISL errors other than isl_error_quota happen.  Dump if the
> schedule is the same.
> * graphite-sese-to-poly.c (build_poly_scop): Fail on ISL
> errors instead of aborting inside ISL.
>
>
Looks good.


[PATCH][GRAPHITE] Simplify SCOP detection

2017-09-26 Thread Richard Biener

The following is the result of me trying to understand SCOP detection
and the validity checks spread around the machinery.  It removes several
quadraticnesses by folding validity checks into 
scop_detection::harmful_loop_in_region where we already walk over all
BBs in the region and process individual found loops.

It also rewrites build_scop_depth/build_scop_breadth into something
I can understand.

Bootstrap and regtest is running on x86_64-unknown-linux-gnu (graphite.exp
for all langs is happy, so is SPEC CPU 2006 testing where the statistics
agree before/after the patch).

I'll apply this after the bootstrap finished.

Richard.

2017-09-26  Richard Biener  

* graphite-scop-detection.c (scop_detection::build_scop_depth): Rewrite,
fold in ...
(scop_detection::build_scop_breadth): ... this.  Removed.
(scop_detection::loop_is_valid_in_scop): Fold into single caller.
(scop_detection::harmful_stmt_in_bb): Likewise.
(scop_detection::graphite_can_represent_stmt): Likewise.
(scop_detection::loop_body_is_valid_scop): Likewise.  Remove recursion.
(scop_detection::can_represent_loop): Remove recursion, fold in ...
(scop_detection::can_represent_loop_1): ... this.  Removed.
(scop_detection::harmful_loop_in_region): Simplify after inlining
the above and remove more quadraticness.
(build_scops): Adjust.
* tree-data-ref.c (loop_nest_has_data_refs): Remove pointless
quadraticness.


Index: gcc/graphite-scop-detection.c
===
--- gcc/graphite-scop-detection.c   (revision 253199)
+++ gcc/graphite-scop-detection.c   (working copy)
@@ -362,17 +362,7 @@ public:
 
   /* Build scop outer->inner if possible.  */
 
-  sese_l build_scop_depth (sese_l s, loop_p loop);
-
-  /* If loop and loop->next are valid scops, try to merge them.  */
-
-  sese_l build_scop_breadth (sese_l s1, loop_p loop);
-
-  /* Return true when LOOP is a valid scop, that is a Static Control Part, a
- region of code that can be represented in the polyhedral model.  SCOP
- defines the region we analyse.  */
-
-  bool loop_is_valid_in_scop (loop_p loop, sese_l scop) const;
+  void build_scop_depth (loop_p loop);
 
   /* Return true when BEGIN is the preheader edge of a loop with a single exit
  END.  */
@@ -398,18 +388,6 @@ public:
 
   void remove_intersecting_scops (sese_l s1);
 
-  /* Return true when the body of LOOP has statements that can be represented
- as a valid scop.  */
-
-  bool loop_body_is_valid_scop (loop_p loop, sese_l scop) const;
-
-  /* Return true when BB contains a harmful operation for a scop: that
- can be a function call with side effects, the induction variables
- are not linear with respect to SCOP, etc.  The current open
- scop should end before this statement.  */
-
-  bool harmful_stmt_in_bb (sese_l scop, basic_block bb) const;
-
   /* Return true when a statement in SCOP cannot be represented by Graphite.
  The assumptions are that L1 dominates L2, and SCOP->entry dominates L1.
  Limit the number of bbs between adjacent loops to
@@ -467,19 +445,12 @@ public:
  FIXME: For the moment, graphite cannot be used on loops that iterate using
  induction variables that wrap.  */
 
-  static bool can_represent_loop_1 (loop_p loop, sese_l scop);
-
-  /* Return true when all the loops within LOOP can be represented by
- Graphite.  */
-
   static bool can_represent_loop (loop_p loop, sese_l scop);
 
   /* Returns the number of pbbs that are in loops contained in SCOP.  */
 
   static int nb_pbbs_in_loops (scop_p scop);
 
-  static bool graphite_can_represent_stmt (sese_l, gimple *, basic_block);
-
 private:
   vec scops;
 };
@@ -673,10 +644,6 @@ scop_detection::merge_sese (sese_l first
   return invalid_sese;
 }
 
-  /* Analyze all the BBs in new sese.  */
-  if (harmful_loop_in_region (combined))
-return invalid_sese;
-
   DEBUG_PRINT (dp << "[merged-sese] s1: "; print_sese (dump_file, combined));
 
   return combined;
@@ -684,71 +651,40 @@ scop_detection::merge_sese (sese_l first
 
 /* Build scop outer->inner if possible.  */
 
-sese_l
-scop_detection::build_scop_depth (sese_l s, loop_p loop)
-{
-  if (!loop)
-return s;
-
-  DEBUG_PRINT (dp << "[Depth loop_" << loop->num << "]\n");
-  s = build_scop_depth (s, loop->inner);
-
-  sese_l s2 = merge_sese (s, get_sese (loop));
-  if (!s2)
-{
-  /* s might be a valid scop, so return it and start analyzing from the
-adjacent loop.  */
-  build_scop_depth (invalid_sese, loop->next);
-  return s;
-}
-
-  if (!loop_is_valid_in_scop (loop, s2))
-return build_scop_depth (invalid_sese, loop->next);
-
-  return build_scop_breadth (s2, loop);
-}
-
-/* If loop and loop->next are valid scops, try to merge them.  */
-
-sese_l
-scop_detection::build_scop_breadth (sese_l s1, loop_p loop)
+void

Re: [PATCH][GRAPHITE] More TLC

2017-09-25 Thread Richard Biener
On Fri, 22 Sep 2017, Sebastian Pop wrote:

> On Fri, Sep 22, 2017 at 8:03 AM, Richard Biener  wrote:
> 
> >
> > This simplifies canonicalize_loop_closed_ssa and does other minimal
> > TLC.  It also adds a testcase I reduced from a stupid mistake I made
> > when reworking canonicalize_loop_closed_ssa.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
> >
> > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> > -Ofast -march=haswell -floop-nest-optimize are
> >
> >  61 loop nests "optimized"
> >  45 loop nest transforms cancelled because of code generation issues
> >  21 loop nest optimizations timed out the 35 ISL "operations" we allow
> >
> > I say "optimized" because the usual transform I've seen is static tiling
> > as enforced by GRAPHITE according to --param loop-block-tile-size.
> > There's no way to automagically figure what kind of transform ISL did
> >
> 
> Here is how to automate (without magic) the detection
> of the transform that isl did.
> 
> The problem solved by isl is the minimization of strides
> in memory, and to do this, we need to tell the isl scheduler
> the validity dependence graph, in graphite-optimize-isl.c
> see the validity (RAW, WAR, WAW) and the proximity
> (RAR + validity) maps.  The proximity does include the
> read after read, as the isl scheduler needs to minimize
> strides between consecutive reads.
> 
> When you apply the schedule to the dependence graph,
> one can tell from the result the strides in memory, a good
> way to say whether a transform was beneficial is to sum up
> all memory strides, and make sure that the sum of all strides
> decreases after transform.  We could add a printf with the
> sum of strides before and after transforms, and have the
> testcases check for that.

Interesting.  Can you perhaps show me in code how to do that?

Thanks,
Richard.


Re: [PATCH][GRAPHITE] More TLC

2017-09-25 Thread Richard Biener
On Mon, 25 Sep 2017, Bin.Cheng wrote:

> On Mon, Sep 25, 2017 at 1:46 PM, Richard Biener  wrote:
> > On Mon, 25 Sep 2017, Richard Biener wrote:
> >
> >> On Fri, 22 Sep 2017, Richard Biener wrote:
> >>
> >> >
> >> > This simplifies canonicalize_loop_closed_ssa and does other minimal
> >> > TLC.  It also adds a testcase I reduced from a stupid mistake I made
> >> > when reworking canonicalize_loop_closed_ssa.
> >> >
> >> > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
> >> >
> >> > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> >> > -Ofast -march=haswell -floop-nest-optimize are
> >> >
> >> >  61 loop nests "optimized"
> >> >  45 loop nest transforms cancelled because of code generation issues
> >> >  21 loop nest optimizations timed out the 35 ISL "operations" we 
> >> > allow
> >>
> >> Overall compile time (with -j6) is 695 sec. w/o -floop-nest-optimize
> >> and 709 sec. with (this was with release checking).
> >>
> >> A single-run has 416.gamess (580s -> 618s),
> >> 436.cactusADM (206s -> 182s), 437.leslie3d (228s ->218s),
> >> 450.soplex (229s -> 226s), 465.tonto (428s -> 425s), 401.bzip2 (383s ->
> >> 379s), 462.libquantum (352s -> 343s), ignoring +-2s changes.  Will
> >> do a 3-run for those to confirm (it would be only a single regression
> >> for 416.gamess).
> >
> > 416.gamess regression confirmed, 450.soplex improvement as well,
> 436/437 improvements?  450.soplex (229s -> 226s) looks like noise.

base is with -floop-nest-optimize, peak without.

416.gamess      19580   619   31.7 S   19580   576   34.0 *
416.gamess      19580   614   31.9 S   19580   577   33.9 S
416.gamess      19580   618   31.7 *   19580   576   34.0 S
436.cactusADM   11950   194   61.5 S   11950   204   58.5 S
436.cactusADM   11950   184   65.0 S   11950   187   63.8 *
436.cactusADM   11950   186   64.1 *   11950   186   64.1 S
437.leslie3d     9400   219   43.0 S    9400   218   43.1 S
437.leslie3d     9400   219   43.0 *    9400   223   42.1 S
437.leslie3d     9400   218   43.0 S    9400   223   42.2 *
450.soplex       8340   225   37.0 S    8340   231   36.1 S
450.soplex       8340   226   36.9 *    8340   230   36.3 *
450.soplex       8340   227   36.8 S    8340   229   36.4 S
465.tonto        9840   426   23.1 S    9840   427   23.0 *
465.tonto        9840   424   23.2 S    9840   430   22.9 S
465.tonto        9840   425   23.2 *    9840   425   23.2 S
401.bzip2        9650   379   25.5 S    9650   378   25.5 S
401.bzip2        9650   379   25.5 *    9650   380   25.4 *
401.bzip2        9650   379   25.5 S    9650   380   25.4 S
462.libquantum  20720   351   59.0 *   20720   349   59.4 S
462.libquantum  20720   351   59.0 S   20720   345   60.1 *
462.libquantum  20720   352   58.8 S   20720   344   60.2 S



> Thanks,
> bin
> > in the three-run 462.libquantum regresses (344s -> 351s) so I suppose
> > that's noise.
> >
> > Richard.
> 
> 

-- 
Richard Biener 
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 
21284 (AG Nuernberg)


Re: [PATCH][GRAPHITE] More TLC

2017-09-25 Thread Bin.Cheng
On Mon, Sep 25, 2017 at 1:46 PM, Richard Biener  wrote:
> On Mon, 25 Sep 2017, Richard Biener wrote:
>
>> On Fri, 22 Sep 2017, Richard Biener wrote:
>>
>> >
>> > This simplifies canonicalize_loop_closed_ssa and does other minimal
>> > TLC.  It also adds a testcase I reduced from a stupid mistake I made
>> > when reworking canonicalize_loop_closed_ssa.
>> >
>> > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
>> >
>> > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
>> > -Ofast -march=haswell -floop-nest-optimize are
>> >
>> >  61 loop nests "optimized"
>> >  45 loop nest transforms cancelled because of code generation issues
>> >  21 loop nest optimizations timed out the 35 ISL "operations" we allow
>>
>> Overall compile time (with -j6) is 695 sec. w/o -floop-nest-optimize
>> and 709 sec. with (this was with release checking).
>>
>> A single-run has 416.gamess (580s -> 618s),
>> 436.cactusADM (206s -> 182s), 437.leslie3d (228s ->218s),
>> 450.soplex (229s -> 226s), 465.tonto (428s -> 425s), 401.bzip2 (383s ->
>> 379s), 462.libquantum (352s -> 343s), ignoring +-2s changes.  Will
>> do a 3-run for those to confirm (it would be only a single regression
>> for 416.gamess).
>
> 416.gamess regression confirmed, 450.soplex improvement as well,
436/437 improvements?  450.soplex (229s -> 226s) looks like noise.

Thanks,
bin
> in the three-run 462.libquantum regresses (344s -> 351s) so I suppose
> that's noise.
>
> Richard.


Re: [PATCH][GRAPHITE] More TLC

2017-09-25 Thread Richard Biener
On Mon, 25 Sep 2017, Richard Biener wrote:

> On Fri, 22 Sep 2017, Richard Biener wrote:
> 
> > 
> > This simplifies canonicalize_loop_closed_ssa and does other minimal
> > TLC.  It also adds a testcase I reduced from a stupid mistake I made
> > when reworking canonicalize_loop_closed_ssa.
> > 
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
> > 
> > SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> > -Ofast -march=haswell -floop-nest-optimize are
> > 
> >  61 loop nests "optimized"
> >  45 loop nest transforms cancelled because of code generation issues
> >  21 loop nest optimizations timed out the 35 ISL "operations" we allow
> 
> Overall compile time (with -j6) is 695 sec. w/o -floop-nest-optimize
> and 709 sec. with (this was with release checking).
> 
> A single-run has 416.gamess (580s -> 618s),
> 436.cactusADM (206s -> 182s), 437.leslie3d (228s ->218s),
> 450.soplex (229s -> 226s), 465.tonto (428s -> 425s), 401.bzip2 (383s -> 
> 379s), 462.libquantum (352s -> 343s), ignoring +-2s changes.  Will
> do a 3-run for those to confirm (it would be only a single regression
> for 416.gamess).

416.gamess regression confirmed, 450.soplex improvement as well,
in the three-run 462.libquantum regresses (344s -> 351s) so I suppose
that's noise.

Richard.


[PATCH][GRAPHITE] More -fopt-info, do not abort from ISL

2017-09-25 Thread Richard Biener

The following also dumps if the optimized schedule is equal to the
original one.  It also makes all ISL operations (well, nearly) not
abort on errors but instead propagate errors upward.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2017-09-25  Richard Biener  

* graphite-optimize-isl.c (optimize_isl): Fail and dump if
ISL errors other than isl_error_quota happen.  Dump if the
schedule is the same.
* graphite-sese-to-poly.c (build_poly_scop): Fail on ISL
errors instead of aborting inside ISL.

Index: gcc/graphite-optimize-isl.c
===
--- gcc/graphite-optimize-isl.c (revision 253134)
+++ gcc/graphite-optimize-isl.c (working copy)
@@ -111,6 +111,7 @@ scop_get_domains (scop_p scop)
 static bool
 optimize_isl (scop_p scop)
 {
+  int old_err = isl_options_get_on_error (scop->isl_context);
   int old_max_operations = isl_ctx_get_max_operations (scop->isl_context);
   int max_operations = PARAM_VALUE (PARAM_MAX_ISL_OPERATIONS);
   if (max_operations)
@@ -150,19 +151,23 @@ optimize_isl (scop_p scop)
   scop->transformed_schedule =
 isl_schedule_map_schedule_node_bottom_up (scop->transformed_schedule,
  get_schedule_for_node_st, NULL);
-  isl_options_set_on_error (scop->isl_context, ISL_ON_ERROR_ABORT);
 
+  isl_options_set_on_error (scop->isl_context, old_err);
   isl_ctx_reset_operations (scop->isl_context);
   isl_ctx_set_max_operations (scop->isl_context, old_max_operations);
   if (!scop->transformed_schedule
-  || isl_ctx_last_error (scop->isl_context) == isl_error_quota)
+  || isl_ctx_last_error (scop->isl_context) != isl_error_none)
 {
   location_t loc = find_loop_location
(scop->scop_info->region.entry->dest->loop_father);
-  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
-  "loop nest not optimized, optimization timed out "
-  "after %d operations [--param max-isl-operations]\n",
-  max_operations);
+  if (isl_ctx_last_error (scop->isl_context) == isl_error_quota)
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
+"loop nest not optimized, optimization timed out "
+"after %d operations [--param max-isl-operations]\n",
+max_operations);
+  else
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
+"loop nest not optimized, ISL signalled an error\n");
   return false;
 }
 
@@ -175,12 +180,13 @@ optimize_isl (scop_p scop)
 
   if (same_schedule)
 {
+  location_t loc = find_loop_location
+   (scop->scop_info->region.entry->dest->loop_father);
+  dump_printf_loc (MSG_NOTE, loc,
+  "loop nest not optimized, optimized schedule is "
+  "identical to original schedule\n");
   if (dump_file)
-   {
- fprintf (dump_file, "[scheduler] isl optimized schedule is "
-  "identical to the original schedule.\n");
- print_schedule_ast (dump_file, scop->original_schedule, scop);
-   }
+   print_schedule_ast (dump_file, scop->original_schedule, scop);
   isl_schedule_free (scop->transformed_schedule);
   scop->transformed_schedule = isl_schedule_copy (scop->original_schedule);
   return false;
Index: gcc/graphite-sese-to-poly.c
===
--- gcc/graphite-sese-to-poly.c (revision 253134)
+++ gcc/graphite-sese-to-poly.c (working copy)
@@ -1244,6 +1244,9 @@ build_original_schedule (scop_p scop)
 bool
 build_poly_scop (scop_p scop)
 {
+  int old_err = isl_options_get_on_error (scop->isl_context);
+  isl_options_set_on_error (scop->isl_context, ISL_ON_ERROR_CONTINUE);
+
   build_scop_context (scop);
 
   unsigned i = 0;
@@ -1253,6 +1256,14 @@ build_poly_scop (scop_p scop)
 
   build_scop_drs (scop);
   build_original_schedule (scop);
-  return true;
+
+  enum isl_error err = isl_ctx_last_error (scop->isl_context);
+  isl_ctx_reset_error (scop->isl_context);
+  isl_options_set_on_error (scop->isl_context, old_err);
+  if (err != isl_error_none)
+dump_printf (MSG_MISSED_OPTIMIZATION,
+"ISL error while building poly scop\n");
+
+  return err == isl_error_none;
 }
 #endif  /* HAVE_isl */


Re: [PATCH][GRAPHITE] More TLC

2017-09-25 Thread Richard Biener
On Fri, 22 Sep 2017, Richard Biener wrote:

> 
> This simplifies canonicalize_loop_closed_ssa and does other minimal
> TLC.  It also adds a testcase I reduced from a stupid mistake I made
> when reworking canonicalize_loop_closed_ssa.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
> 
> SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> -Ofast -march=haswell -floop-nest-optimize are
> 
>  61 loop nests "optimized"
>  45 loop nest transforms cancelled because of code generation issues
>  21 loop nest optimizations timed out the 35 ISL "operations" we allow

Overall compile time (with -j6) is 695 sec. w/o -floop-nest-optimize
and 709 sec. with (this was with release checking).

A single-run has 416.gamess (580s -> 618s),
436.cactusADM (206s -> 182s), 437.leslie3d (228s ->218s),
450.soplex (229s -> 226s), 465.tonto (428s -> 425s), 401.bzip2 (383s -> 
379s), 462.libquantum (352s -> 343s), ignoring +-2s changes.  Will
do a 3-run for those to confirm (it would be only a single regression
for 416.gamess).

So far I'm positively surprised given the limitations (and inefficiencies)
I know.

I'll add some more opt-info stuff to assess the number of SCOPs we
detect but discard during further analysis and the number of transforms
we cancel because they turn out as a no-op.

Richard.

>  {
> -  if (dump_file && dump_flags)
> - fprintf (dump_file, "isl timed out --param max-isl-operations=%d\n",
> -  max_operations);
> +  location_t loc = find_loop_location
> + (scop->scop_info->region.entry->dest->loop_father);
> +  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
> +"loop nest not optimized, optimization timed out "
> +"after %d operations [--param max-isl-operations]\n",
> +max_operations);
>return false;
>  }
>  
> Index: gcc/graphite.c
> ===
> --- gcc/graphite.c(revision 253091)
> +++ gcc/graphite.c(working copy)
> @@ -293,86 +293,6 @@ free_scops (vec scops)
>scops.release ();
>  }
>  
> -/* Returns true when P1 and P2 are close phis with the same
> -   argument.  */
> -
> -static inline bool
> -same_close_phi_node (gphi *p1, gphi *p2)
> -{
> -  return (types_compatible_p (TREE_TYPE (gimple_phi_result (p1)),
> -   TREE_TYPE (gimple_phi_result (p2)))
> -   && operand_equal_p (gimple_phi_arg_def (p1, 0),
> -   gimple_phi_arg_def (p2, 0), 0));
> -}
> -
> -static void make_close_phi_nodes_unique (basic_block bb);
> -
> -/* Remove the close phi node at GSI and replace its rhs with the rhs
> -   of PHI.  */
> -
> -static void
> -remove_duplicate_close_phi (gphi *phi, gphi_iterator *gsi)
> -{
> -  gimple *use_stmt;
> -  use_operand_p use_p;
> -  imm_use_iterator imm_iter;
> -  tree res = gimple_phi_result (phi);
> -  tree def = gimple_phi_result (gsi->phi ());
> -
> -  gcc_assert (same_close_phi_node (phi, gsi->phi ()));
> -
> -  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
> -{
> -  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> - SET_USE (use_p, res);
> -
> -  update_stmt (use_stmt);
> -
> -  /* It is possible that we just created a duplicate close-phi
> -  for an already-processed containing loop.  Check for this
> -  case and clean it up.  */
> -  if (gimple_code (use_stmt) == GIMPLE_PHI
> -   && gimple_phi_num_args (use_stmt) == 1)
> - make_close_phi_nodes_unique (gimple_bb (use_stmt));
> -}
> -
> -  remove_phi_node (gsi, true);
> -}
> -
> -/* Removes all the close phi duplicates from BB.  */
> -
> -static void
> -make_close_phi_nodes_unique (basic_block bb)
> -{
> -  gphi_iterator psi;
> -
> -  for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next ())
> -{
> -  gphi_iterator gsi = psi;
> -  gphi *phi = psi.phi ();
> -
> -  /* At this point, PHI should be a close phi in normal form.  */
> -  gcc_assert (gimple_phi_num_args (phi) == 1);
> -
> -  /* Iterate over the next phis and remove duplicates.  */
> -  gsi_next ();
> -  while (!gsi_end_p (gsi))
> - if (same_close_phi_node (phi, gsi.phi ()))
> -   remove_duplicate_close_phi (phi, );
> - else
> -   gsi_next ();
> -}
> -}
> -
> -/* Return true when NAME is defined in LOOP.  */
> -
> -static bool
> -defined_in_loop_p (tree name, loop_p loop)
> -{
> -  gcc_assert (TREE_CODE (name) == SSA_NAME);
> -  return loop == loop_containing_stmt (SSA_NAME_DEF_STMT (name));
> -}
> -
>  /* Transforms LOOP to the canonical loop closed SSA form.  */
>  
>  static void
> @@ -380,20 +300,22 @@ canonicalize_loop_closed_ssa (loop_p loo
>  {
>edge e = single_exit (loop);
>basic_block bb;
> +  gphi_iterator psi;
>  
>if (!e || (e->flags & EDGE_COMPLEX))
>  return;
>  
>bb = e->dest;
>  
> +  /* Make the loop-close PHI node BB contain only PHIs and have a
> + single predecessor.  */
>

Re: [PATCH][GRAPHITE] More TLC

2017-09-22 Thread Sebastian Pop
On Fri, Sep 22, 2017 at 8:03 AM, Richard Biener  wrote:

>
> This simplifies canonicalize_loop_closed_ssa and does other minimal
> TLC.  It also adds a testcase I reduced from a stupid mistake I made
> when reworking canonicalize_loop_closed_ssa.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.
>
> SPEC CPU 2006 is happy with it, current statistics on x86_64 with
> -Ofast -march=haswell -floop-nest-optimize are
>
>  61 loop nests "optimized"
>  45 loop nest transforms cancelled because of code generation issues
>  21 loop nest optimizations timed out the 35 ISL "operations" we allow
>
> I say "optimized" because the usual transform I've seen is static tiling
> as enforced by GRAPHITE according to --param loop-block-tile-size.
> There's no way to automagically figure what kind of transform ISL did
>

Here is how to automate (without magic) the detection
of the transform that isl did.

The problem solved by isl is the minimization of strides
in memory, and to do this, we need to tell the isl scheduler
the validity dependence graph, in graphite-optimize-isl.c
see the validity (RAW, WAR, WAW) and the proximity
(RAR + validity) maps.  The proximity does include the
read after read, as the isl scheduler needs to minimize
strides between consecutive reads.

When you apply the schedule to the dependence graph,
you can tell the strides in memory from the result.  A good
way to tell whether a transform was beneficial is to sum up
all memory strides and make sure that the sum of all strides
decreases after the transform.  We could add a printf with the
sum of strides before and after transforms, and have the
testcases check for that.

> (usually none with the schedule identical check confused by FILTER
> stuff positioning).  This is also the issue with most GRAPHITE testcases.
> We can't really verify easily whether we performed loop interchange
> or not.  We can probably tell whether we applied loop fusion or
> splitting (by counting loops).
>
> I'm not aware of any remaining ICEs / wrong-code issues with GRAPHITE.
>
> I'm aware that the current "black-box" granularity hinders
> scheduling freedom (each GIMPLE BB is mapped to an ISL stmt, this
> is too coarse to schedule say two writes in a BB independently
> from each other).  Quick experiments could be done by simply
> splitting gimple BBs at some points.
>
> I'm aware that the SCOP detection algorithm assumes that it can
> walk loop->next and find loops "in order" -- but while that's
> true for the initial flow_loops_find result (DFS walk) it isn't
> true for any later created / discovered loops.  Sorting of
> loop siblings in DFS order should be easy (and a general cfgloopanal
> helper).
>
> Richard.
>
> 2017-09-22  Richard Biener  
>
> * graphite-isl-ast-to-gimple.c (graphite_verify): Inline into
> single caller.
> (graphite_regenerate_ast_isl): Do not reset SCEV.  Move debug
> print of no dependency loops ...
> * graphite.c (graphite_transform_loops): ... here.
> (canonicalize_loop_closed_ssa_form): Work from inner to outer
> loops.
> (same_close_phi_node, remove_duplicate_close_phi,
> make_close_phi_nodes_unique, defined_in_loop_p): Fold into ...
> (canonicalize_loop_closed_ssa): ... here and simplify.
> * graphite-optimize-isl.c: Include tree-vectorizer.h.
> (optimize_isl): Use dump_printf_loc to tell when we stopped
> optimizing because of an ISL timeout.
>
> * gcc.dg/graphite/scop-24.c: New testcase.
>
>
The change looks good to me.

Thanks,
Sebastian


Re: [PATCH][GRAPHITE] Simplify move_sese_in_condition

2017-09-22 Thread Sebastian Pop
On Fri, Sep 22, 2017 at 4:37 AM, Richard Biener  wrote:

>
> This re-implements it avoiding the need to recompute dominators and in
> a much simpler way.
>
> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress, SPEC CPU
> 2006 is happy.
>
> Richard.
>
> 2017-09-22  Richard Biener  
>
> * sese.c: Include cfganal.h.
> (if_region_set_false_region): Remove.
> (create_if_region_on_edge): Likewise.
> (move_sese_in_condition): Re-implement without destroying
> dominators.
>

This is an excellent cleanup.  Thanks!

Sebastian


[PATCH][GRAPHITE] More TLC

2017-09-22 Thread Richard Biener

This simplifies canonicalize_loop_closed_ssa and does other minimal
TLC.  It also adds a testcase I reduced from a stupid mistake I made
when reworking canonicalize_loop_closed_ssa.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

SPEC CPU 2006 is happy with it, current statistics on x86_64 with
-Ofast -march=haswell -floop-nest-optimize are

 61 loop nests "optimized"
 45 loop nest transforms cancelled because of code generation issues
 21 loop nest optimizations timed out the 35 ISL "operations" we allow

I say "optimized" because the usual transform I've seen is static tiling
as enforced by GRAPHITE according to --param loop-block-tile-size.
There's no way to automagically figure what kind of transform ISL did
(usually none with the schedule identical check confused by FILTER
stuff positioning).  This is also the issue with most GRAPHITE testcases.
We can't really verify easily whether we performed loop interchange
or not.  We can probably tell whether we applied loop fusion or
splitting (by counting loops).

I'm not aware of any remaining ICEs / wrong-code issues with GRAPHITE.

I'm aware that the current "black-box" granularity hinders
scheduling freedom (each GIMPLE BB is mapped to an ISL stmt, this
is too coarse to schedule say two writes in a BB independently
from each other).  Quick experiments could be done by simply
splitting gimple BBs at some points.

I'm aware that the SCOP detection algorithm assumes that it can
walk loop->next and find loops "in order" -- but while that's
true for the initial flow_loops_find result (DFS walk) it isn't
true for any later created / discovered loops.  Sorting of
loop siblings in DFS order should be easy (and a general cfgloopanal
helper).

Richard.

2017-09-22  Richard Biener  

* graphite-isl-ast-to-gimple.c (graphite_verify): Inline into
single caller.
(graphite_regenerate_ast_isl): Do not reset SCEV.  Move debug
print of no dependency loops ...
* graphite.c (graphite_transform_loops): ... here.
(canonicalize_loop_closed_ssa_form): Work from inner to outer
loops.
(same_close_phi_node, remove_duplicate_close_phi,
make_close_phi_nodes_unique, defined_in_loop_p): Fold into ...
(canonicalize_loop_closed_ssa): ... here and simplify.
* graphite-optimize-isl.c: Include tree-vectorizer.h.
(optimize_isl): Use dump_printf_loc to tell when we stopped
optimizing because of an ISL timeout.

* gcc.dg/graphite/scop-24.c: New testcase.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253091)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -73,15 +73,6 @@ struct ast_build_info
   bool is_parallelizable;
 };
 
-/* Verifies properties that GRAPHITE should maintain during translation.  */
-
-static inline void
-graphite_verify (void)
-{
-  checking_verify_loop_structure ();
-  checking_verify_loop_closed_ssa (true);
-}
-
 /* IVS_PARAMS maps isl's scattering and parameter identifiers
to corresponding trees.  */
 
@@ -2997,8 +2988,9 @@ graphite_regenerate_ast_isl (scop_p scop
  delete_loop (loop);
 }
 
-  graphite_verify ();
-  scev_reset ();
+  /* Verifies properties that GRAPHITE should maintain during translation.  */
+  checking_verify_loop_structure ();
+  checking_verify_loop_closed_ssa (true);
 
   free (if_region->true_region);
   free (if_region->region);
@@ -3008,19 +3000,6 @@ graphite_regenerate_ast_isl (scop_p scop
   isl_ast_node_free (root_node);
   timevar_pop (TV_GRAPHITE_CODE_GEN);
 
-  if (dump_file && (dump_flags & TDF_DETAILS))
-{
-  loop_p loop;
-  int num_no_dependency = 0;
-
-  FOR_EACH_LOOP (loop, 0)
-   if (loop->can_be_parallel)
- num_no_dependency++;
-
-  fprintf (dump_file, "%d loops carried no dependency.\n",
-  num_no_dependency);
-}
-
   return !t.codegen_error_p ();
 }
 
Index: gcc/graphite-optimize-isl.c
===
--- gcc/graphite-optimize-isl.c (revision 253091)
+++ gcc/graphite-optimize-isl.c (working copy)
@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3.
 #include "tree-data-ref.h"
 #include "params.h"
 #include "dumpfile.h"
+#include "tree-vectorizer.h"
 #include "graphite.h"
 
 
@@ -156,9 +157,12 @@ optimize_isl (scop_p scop)
   if (!scop->transformed_schedule
   || isl_ctx_last_error (scop->isl_context) == isl_error_quota)
 {
-  if (dump_file && dump_flags)
-   fprintf (dump_file, "isl timed out --param max-isl-operations=%d\n",
-max_operations);
+  location_t loc = find_loop_location
+   (scop->scop_info->region.entry->dest->loop_father);
+  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loc,
+  "loop nest not optimized, optimization timed out "
+  "after %d 

[PATCH][GRAPHITE] Simplify move_sese_in_condition

2017-09-22 Thread Richard Biener

This re-implements it avoiding the need to recompute dominators and in
a much simpler way.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress, SPEC CPU 
2006 is happy.

Richard.

2017-09-22  Richard Biener  

* sese.c: Include cfganal.h.
(if_region_set_false_region): Remove.
(create_if_region_on_edge): Likewise.
(move_sese_in_condition): Re-implement without destroying
dominators.

Index: gcc/sese.c
===
--- gcc/sese.c  (revision 253090)
+++ gcc/sese.c  (working copy)
@@ -40,8 +40,9 @@ along with GCC; see the file COPYING3.
 #include "cfgloop.h"
 #include "tree-data-ref.h"
 #include "tree-scalar-evolution.h"
-#include "sese.h"
 #include "tree-ssa-propagate.h"
+#include "cfganal.h"
+#include "sese.h"
 
 /* For a USE in BB, if BB is outside REGION, mark the USE in the
LIVEOUTS set.  */
@@ -333,99 +334,6 @@ get_false_edge_from_guard_bb (basic_bloc
   return NULL;
 }
 
-/* Sets the false region of an IF_REGION to REGION.  */
-
-static void
-if_region_set_false_region (ifsese if_region, sese_info_p region)
-{
-  free_dominance_info (CDI_DOMINATORS);
-
-  basic_block condition = if_region_get_condition_block (if_region);
-  edge false_edge = get_false_edge_from_guard_bb (condition);
-  basic_block dummy = false_edge->dest;
-  edge entry_region = region->region.entry;
-  edge exit_region = region->region.exit;
-  basic_block before_region = entry_region->src;
-  basic_block last_in_region = exit_region->src;
-  hashval_t hash = htab_hash_pointer (exit_region);
-  loop_exit **slot
-= current_loops->exits->find_slot_with_hash (exit_region, hash, NO_INSERT);
-  bool latch_p
-= exit_region->dest->loop_father->latch == exit_region->src;
-
-  entry_region->flags = false_edge->flags;
-  false_edge->flags = exit_region->flags;
-
-  redirect_edge_pred (entry_region, condition);
-  redirect_edge_pred (exit_region, before_region);
-  redirect_edge_pred (false_edge, last_in_region);
-  redirect_edge_succ (false_edge, single_succ (dummy));
-  delete_basic_block (dummy);
-
-  exit_region->flags = EDGE_FALLTHRU;
-
-  region->region.exit = false_edge;
-
-  free (if_region->false_region);
-  if_region->false_region = region;
-
-  if (slot)
-{
-  struct loop_exit *loop_exit = ggc_cleared_alloc ();
-
-  memcpy (loop_exit, *((struct loop_exit **) slot),
- sizeof (struct loop_exit));
-  current_loops->exits->clear_slot (slot);
-
-  hashval_t hash = htab_hash_pointer (false_edge);
-  slot = current_loops->exits->find_slot_with_hash (false_edge, hash,
-   INSERT);
-  loop_exit->e = false_edge;
-  *slot = loop_exit;
-  false_edge->src->loop_father->exits->next = loop_exit;
-}
-  if (latch_p)
-exit_region->dest->loop_father->latch = before_region;
-
-  calculate_dominance_info (CDI_DOMINATORS);
-}
-
-/* Creates an IFSESE with CONDITION on edge ENTRY.  */
-
-static ifsese
-create_if_region_on_edge (edge entry, tree condition)
-{
-  edge e;
-  edge_iterator ei;
-  sese_info_p sese_region = XNEW (struct sese_info_t);
-  sese_info_p true_region = XNEW (struct sese_info_t);
-  sese_info_p false_region = XNEW (struct sese_info_t);
-  ifsese if_region = XNEW (struct ifsese_s);
-  edge exit = create_empty_if_region_on_edge (entry, condition);
-
-  if_region->region = sese_region;
-  if_region->region->region.entry = entry;
-  if_region->region->region.exit = exit;
-
-  FOR_EACH_EDGE (e, ei, entry->dest->succs)
-{
-  if (e->flags & EDGE_TRUE_VALUE)
-   {
- true_region->region.entry = e;
- true_region->region.exit = single_succ_edge (e->dest);
- if_region->true_region = true_region;
-   }
-  else if (e->flags & EDGE_FALSE_VALUE)
-   {
- false_region->region.entry = e;
- false_region->region.exit = single_succ_edge (e->dest);
- if_region->false_region = false_region;
-   }
-}
-
-  return if_region;
-}
-
 /* Moves REGION in a condition expression:
| if (1)
|   ;
@@ -436,14 +344,32 @@ create_if_region_on_edge (edge entry, tr
 ifsese
 move_sese_in_condition (sese_info_p region)
 {
-  gcc_assert (! dom_info_available_p (cfun, CDI_POST_DOMINATORS));
+  basic_block region_entry_dest = region->region.entry->dest;
   basic_block pred_block = split_edge (region->region.entry);
-  ifsese if_region;
+  basic_block merge_block = split_edge (region->region.exit);
 
-  region->region.entry = single_succ_edge (pred_block);
-  if_region = create_if_region_on_edge (single_pred_edge (pred_block),
-   integer_one_node);
-  if_region_set_false_region (if_region, region);
+  edge true_edge = make_edge (pred_block, merge_block, EDGE_TRUE_VALUE);
+  edge false_edge = find_edge (pred_block, region_entry_dest);
+  false_edge->flags &= ~EDGE_FALLTHRU;
+  false_edge->flags |= EDGE_FALSE_VALUE;
+  

[PATCH][GRAPHITE] Strip down dominator recompute, checking and friends

2017-09-21 Thread Richard Biener

The following is a quick attempt at reducing pass overhead.  The
main part is maintaining post-dominators only during scop detection
and not recomputing / verifying everything many many times for no
good reason.

Somehow this means I ran into a latent bug where ISL split
a loop into two, not mixing loop and condition PHIs and thus
confusing translate_pending_phi_nodes.

I moved lc SSA "canonicalization" and cleaned it up a bit.  I think
it would be useful to factor the requirements into the general
version.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

The graphite testsuites and SPEC CPU 2006 don't show any new
issues.  I'll apply this when testing finished.

The immediate plan is to look at SCOP detection and make it
more intelligently look at loop->next for the breadth build
given that the loop->next chain is not necessarily ordered
in the way it thinks.

Thanks,
Richard.

2017-09-21  Richard Biener  

* graphite-isl-ast-to-gimple.c (translate_pending_phi_nodes):
Verify both BBs contain loop PHI nodes before dispatching to
copy_loop_phi_args.
(graphite_regenerate_ast_isl): Do not recompute dominators,
do not verify three times.  Restructure for clarity.
* graphite-scop-detection.c (same_close_phi_node,
remove_duplicate_close_phi, make_close_phi_nodes_unique,
defined_in_loop_p, canonicalize_loop_closed_ssa,
canonicalize_loop_closed_ssa_form): Simplify, remove excess
checking and SSA rewrite, move to ...
* graphite.c: ... here.  Include ssa.h and tree-ssa-loop-manip.h.
(graphite_initialize): Do not pass in ctx, do not reset the
SCEV cache, compute only dominators.
(graphite_transform_loops): Allocate ISL ctx after
graphite_initialize.  Call canonicalize_loop_closed_ssa_form.
Maintain post-dominators only around build_scops.
* sese.c (if_region_set_false_region): Make static.  Free
and recompute dominators.
(move_sese_in_condition): Assert we don't get called with
post-dominators computed.
* sese.h (if_region_set_false_region): Remove.

Index: gcc/graphite-isl-ast-to-gimple.c
===
--- gcc/graphite-isl-ast-to-gimple.c(revision 253064)
+++ gcc/graphite-isl-ast-to-gimple.c(working copy)
@@ -2759,7 +2759,8 @@ translate_pending_phi_nodes ()
}
 
   auto_vec  iv_map;
-  if (bb_contains_loop_phi_nodes (new_bb))
+  if (bb_contains_loop_phi_nodes (new_bb)
+ && bb_contains_loop_phi_nodes (old_bb))
codegen_error = !copy_loop_phi_args (old_phi, ibp_old_bb, new_phi,
ibp_new_bb, false);
   else if (bb_contains_loop_close_phi_nodes (new_bb))
@@ -2941,12 +2942,8 @@ graphite_regenerate_ast_isl (scop_p scop
   print_isl_ast (dump_file, root_node);
 }
 
-  recompute_all_dominators ();
-  graphite_verify ();
-
   if_region = move_sese_in_condition (region);
   region->if_region = if_region;
-  recompute_all_dominators ();
 
   loop_p context_loop = region->region.entry->src->loop_father;
 
@@ -2960,45 +2957,28 @@ graphite_regenerate_ast_isl (scop_p scop
   region->if_region->true_region->region.exit = single_succ_edge (bb);
 
   t.translate_isl_ast (context_loop, root_node, e, ip);
+  if (! t.codegen_error_p ())
+t.translate_pending_phi_nodes ();
+  if (! t.codegen_error_p ())
+{
+  sese_insert_phis_for_liveouts (region,
+if_region->region->region.exit->src,
+if_region->false_region->region.exit,
+if_region->true_region->region.exit);
+  if (dump_file)
+   fprintf (dump_file, "[codegen] isl AST to Gimple succeeded.\n");
+
+  mark_virtual_operands_for_renaming (cfun);
+  update_ssa (TODO_update_ssa);
+}
+
   if (t.codegen_error_p ())
 {
   if (dump_file)
fprintf (dump_file, "codegen error: "
 "reverting back to the original code.\n");
   set_ifsese_condition (if_region, integer_zero_node);
-}
-  else
-{
-  t.translate_pending_phi_nodes ();
-  if (!t.codegen_error_p ())
-   {
- sese_insert_phis_for_liveouts (region,
-if_region->region->region.exit->src,
-if_region->false_region->region.exit,
-if_region->true_region->region.exit);
- mark_virtual_operands_for_renaming (cfun);
- update_ssa (TODO_update_ssa);
-
-
- graphite_verify ();
- scev_reset ();
- recompute_all_dominators ();
- graphite_verify ();
 
- if (dump_file)
-   fprintf (dump_file, "[codegen] isl AST to Gimple succeeded.\n");
-   }
-  else
-   {
- if (dump_file)
-   fprintf (dump_file, 

  1   2   3   >