On Wed, 2012-03-28 at 15:57 +0200, Richard Guenther wrote:
> On Tue, Mar 6, 2012 at 9:49 PM, William J. Schmidt
> <wschm...@linux.vnet.ibm.com> wrote:
> > Hi,
> >
> > This is a re-post of the patch I posted for comments in January to
> > address http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18589.  The patch
> > modifies reassociation to expose repeated factors from __builtin_pow*
> > calls, optimally reassociate repeated factors, and possibly reconstitute
> > __builtin_powi calls from the results of reassociation.
> >
> > Bootstrapped and passes regression tests for powerpc64-linux-gnu.  I
> > expect there may need to be some small changes, but I am targeting this
> > for trunk approval.
> >
> > Thanks very much for the review,
> 
> Hmm.  How much work would it be to extend the reassoc 'IL' to allow
> a repeat factor per op?  I realize what you do is all within what reassoc
> already does though ideally we would not require any GIMPLE IL changes
> for building up / optimizing the reassoc IL but only do so when we commit
> changes.
> 
> Thanks,
> Richard.

Hi Richard,

I've revised my patch along these lines; see the new version below.
While testing it I realized I could do a better job of reducing the
number of multiplies, so there are some changes to that logic as well,
and a couple of additional test cases.  Regstrapped successfully on
powerpc64-linux.

Hope this looks better!

Thanks,
Bill


gcc:

2012-04-03  Bill Schmidt  <wschm...@linux.vnet.ibm.com>

        PR tree-optimization/18589
        * tree-pass.h: Replace pass_reassoc with pass_early_reassoc and
        pass_late_reassoc.
        * passes.c (init_optimization_passes): Change pass_reassoc calls to
        pass_early_reassoc and pass_late_reassoc.
        * tree-ssa-reassoc.c (reassociate_stats): Add two fields.
        (operand_entry): Add count field.
        (early_reassoc): New static var.
        (add_repeat_to_ops_vec): New function.
        (completely_remove_stmt): Likewise.
        (remove_def_if_absorbed_call): Likewise.
        (remove_visited_stmt_chain): Remove feeding builtin pow/powi calls.
        (acceptable_pow_call): New function.
        (linearize_expr_tree): Look for builtin pow/powi calls and add operand
        entries with repeat counts when found.
        (repeat_factor_d): New struct and associated typedefs.
        (repeat_factor_vec): New static vector variable.
        (compare_repeat_factors): New function.
        (get_reassoc_pow_ssa_name): Likewise.
        (attempt_builtin_powi): Likewise.
        (reassociate_bb): Attempt to create __builtin_powi calls, and multiply
        their results by any leftover reassociated factors; remove builtin
        pow/powi calls that were absorbed by reassociation.
        (fini_reassoc): Two new calls to statistics_counter_event.
        (execute_early_reassoc): New function.
        (execute_late_reassoc): Likewise.
        (pass_early_reassoc): Replace pass_reassoc, renamed to reassoc1,
        call execute_early_reassoc.
        (pass_late_reassoc): New gimple_opt_pass named reassoc2 that calls
        execute_late_reassoc.

gcc/testsuite:

2012-04-03  Bill Schmidt  <wschm...@linux.vnet.ibm.com>

        PR tree-optimization/18589
        * gcc.dg/pr46309.c: Change -fdump-tree-reassoc-details to
        -fdump-tree-reassoc[12]-details.
        * gcc.dg/tree-ssa/pr18589-1.c: New test.
        * gcc.dg/tree-ssa/pr18589-2.c: Likewise.
        * gcc.dg/tree-ssa/pr18589-3.c: Likewise.
        * gcc.dg/tree-ssa/pr18589-4.c: Likewise.
        * gcc.dg/tree-ssa/pr18589-5.c: Likewise.
        * gcc.dg/tree-ssa/pr18589-6.c: Likewise.
        * gcc.dg/tree-ssa/pr18589-7.c: Likewise.
        * gcc.dg/tree-ssa/pr18589-8.c: Likewise.
        * gcc.dg/tree-ssa/pr18589-9.c: Likewise.
        * gcc.dg/tree-ssa/pr18589-10.c: Likewise.


Index: gcc/tree-pass.h
===================================================================
--- gcc/tree-pass.h     (revision 186108)
+++ gcc/tree-pass.h     (working copy)
@@ -441,7 +441,8 @@ extern struct gimple_opt_pass pass_copy_prop;
 extern struct gimple_opt_pass pass_vrp;
 extern struct gimple_opt_pass pass_uncprop;
 extern struct gimple_opt_pass pass_return_slot;
-extern struct gimple_opt_pass pass_reassoc;
+extern struct gimple_opt_pass pass_early_reassoc;
+extern struct gimple_opt_pass pass_late_reassoc;
 extern struct gimple_opt_pass pass_rebuild_cgraph_edges;
 extern struct gimple_opt_pass pass_remove_cgraph_callee_edges;
 extern struct gimple_opt_pass pass_build_cgraph_edges;
Index: gcc/testsuite/gcc.dg/pr46309.c
===================================================================
--- gcc/testsuite/gcc.dg/pr46309.c      (revision 186108)
+++ gcc/testsuite/gcc.dg/pr46309.c      (working copy)
@@ -1,6 +1,6 @@
 /* PR tree-optimization/46309 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -fdump-tree-reassoc-details" } */
+/* { dg-options "-O2 -fdump-tree-reassoc1-details -fdump-tree-reassoc2-details" } */
 /* The transformation depends on BRANCH_COST being greater than 1
    (see the notes in the PR), so try to force that.  */
 /* { dg-additional-options "-mtune=octeon2" { target mips*-*-* } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c   (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z, double u)
+{
+  return x * x * y * y * y * z * z * z * z * u;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c   (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z, double u)
+{
+  return x * x * x * y * y * y * z * z * z * z * u * u * u * u;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c   (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return __builtin_pow (x, 3.0) * __builtin_pow (y, 4.0);
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 4 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c   (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+float baz (float x, float y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c   (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+long double baz (long double x, long double y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-9.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-9.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-9.c   (revision 0)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z)
+{
+  return (__builtin_pow (x, 3.0) * __builtin_pow (y, 2.0)
+         * __builtin_pow (z, 5.0));
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-10.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-10.c  (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-10.c  (revision 0)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z)
+{
+  return (__builtin_pow (x, 4.0) * __builtin_pow (y, 2.0)
+         * __builtin_pow (z, 4.0));
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 5 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c   (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return x * x * x * x * y * y * y * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c   (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y)
+{
+  return x * y * y * x * y * x * x * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c   (revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
+
+double baz (double x, double y, double z)
+{
+  return x * x * y * y * y * z * z * z * z;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 5 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
Index: gcc/passes.c
===================================================================
--- gcc/passes.c        (revision 186108)
+++ gcc/passes.c        (working copy)
@@ -1325,7 +1325,7 @@ init_optimization_passes (void)
         opportunities.  */
       NEXT_PASS (pass_phi_only_cprop);
       NEXT_PASS (pass_dse);
-      NEXT_PASS (pass_reassoc);
+      NEXT_PASS (pass_early_reassoc);
       NEXT_PASS (pass_dce);
       NEXT_PASS (pass_forwprop);
       NEXT_PASS (pass_phiopt);
@@ -1377,7 +1377,7 @@ init_optimization_passes (void)
        }
       NEXT_PASS (pass_lower_vector_ssa);
       NEXT_PASS (pass_cse_reciprocals);
-      NEXT_PASS (pass_reassoc);
+      NEXT_PASS (pass_late_reassoc);
       NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_dominator);
       /* The only const/copy propagation opportunities left after
Index: gcc/tree-ssa-reassoc.c
===================================================================
--- gcc/tree-ssa-reassoc.c      (revision 186108)
+++ gcc/tree-ssa-reassoc.c      (working copy)
@@ -61,6 +61,10 @@ along with GCC; see the file COPYING3.  If not see
     3. Optimization of the operand lists, eliminating things like a +
     -a, a & a, etc.
 
+    3a. Combine repeated factors with the same occurrence counts
+    into a __builtin_powi call that will later be optimized into
+    an optimal number of multiplies.
+
     4. Rewrite the expression trees we linearized and optimized so
     they are in proper rank order.
 
@@ -169,6 +173,8 @@ static struct
   int constants_eliminated;
   int ops_eliminated;
   int rewritten;
+  int pows_encountered;
+  int pows_created;
 } reassociate_stats;
 
 /* Operator, rank pair.  */
@@ -177,6 +183,7 @@ typedef struct operand_entry
   unsigned int rank;
   int id;
   tree op;
+  unsigned int count;
 } *operand_entry_t;
 
 static alloc_pool operand_entry_pool;
@@ -190,6 +197,12 @@ static int next_operand_entry_id;
    depth.  */
 static long *bb_rank;
 
+/* Distinguish between early and late reassociation passes.  Early
+   reassociation is permitted to create __builtin_pow* calls.  This
+   is not done in late reassociation to preserve the builtin
+   optimizations performed in pass_cse_sincos.  */
+static bool early_reassoc;
+
 /* Operand->rank hashtable.  */
 static struct pointer_map_t *operand_rank;
 
@@ -515,9 +528,28 @@ add_to_ops_vec (VEC(operand_entry_t, heap) **ops,
   oe->op = op;
   oe->rank = get_rank (op);
   oe->id = next_operand_entry_id++;
+  oe->count = 1;
   VEC_safe_push (operand_entry_t, heap, *ops, oe);
 }
 
+/* Add an operand entry to *OPS for the tree operand OP with repeat
+   count REPEAT.  */
+
+static void
+add_repeat_to_ops_vec (VEC(operand_entry_t, heap) **ops, tree op,
+                      HOST_WIDE_INT repeat)
+{
+  operand_entry_t oe = (operand_entry_t) pool_alloc (operand_entry_pool);
+
+  oe->op = op;
+  oe->rank = get_rank (op);
+  oe->id = next_operand_entry_id++;
+  oe->count = repeat;
+  VEC_safe_push (operand_entry_t, heap, *ops, oe);
+
+  reassociate_stats.pows_encountered++;
+}
+
 /* Return true if STMT is reassociable operation containing a binary
    operation with tree code CODE, and is inside LOOP.  */
 
@@ -2049,6 +2081,32 @@ is_phi_for_stmt (gimple stmt, tree operand)
   return false;
 }
 
+/* Remove STMT, unlink its virtual defs, and release its SSA defs.  */
+
+static inline void
+completely_remove_stmt (gimple stmt)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+  gsi_remove (&gsi, true);
+  unlink_stmt_vdef (stmt);
+  release_defs (stmt);
+}
+
+/* If OP is defined by a builtin call that has been absorbed by
+   reassociation, remove its defining statement completely.  */
+
+static inline void
+remove_def_if_absorbed_call (tree op)
+{
+  gimple stmt;
+
+  if (TREE_CODE (op) == SSA_NAME
+      && has_zero_uses (op)
+      && is_gimple_call ((stmt = SSA_NAME_DEF_STMT (op)))
+      && gimple_visited_p (stmt))
+    completely_remove_stmt (stmt);
+}
+
 /* Remove def stmt of VAR if VAR has zero uses and recurse
    on rhs1 operand if so.  */
 
@@ -2057,19 +2115,31 @@ remove_visited_stmt_chain (tree var)
 {
   gimple stmt;
   gimple_stmt_iterator gsi;
+  tree var2;
 
   while (1)
     {
       if (TREE_CODE (var) != SSA_NAME || !has_zero_uses (var))
        return;
       stmt = SSA_NAME_DEF_STMT (var);
-      if (!is_gimple_assign (stmt)
-         || !gimple_visited_p (stmt))
+      if (is_gimple_assign (stmt) && gimple_visited_p (stmt))
+       {
+         var = gimple_assign_rhs1 (stmt);
+         var2 = gimple_assign_rhs2 (stmt);
+         gsi = gsi_for_stmt (stmt);
+         gsi_remove (&gsi, true);
+         release_defs (stmt);
+         /* A multiply whose operands are both fed by builtin pow/powi
+            calls must check whether to remove rhs2 as well.  */
+         remove_def_if_absorbed_call (var2);
+       }
+      else if (is_gimple_call (stmt) && gimple_visited_p (stmt))
+       {
+         completely_remove_stmt (stmt);
+         return;
+       }
+      else
        return;
-      var = gimple_assign_rhs1 (stmt);
-      gsi = gsi_for_stmt (stmt);
-      gsi_remove (&gsi, true);
-      release_defs (stmt);
     }
 }
 
@@ -2564,6 +2634,75 @@ break_up_subtract (gimple stmt, gimple_stmt_iterat
   update_stmt (stmt);
 }
 
+/* Determine whether STMT is a builtin call that raises an SSA name
+   to an integer power and has only one use.  If so, and this is early
+   reassociation and unsafe math optimizations are permitted, place
+   the SSA name in *BASE and the exponent in *EXPONENT, and return TRUE.
+   If any of these conditions does not hold, return FALSE.  */
+
+static bool
+acceptable_pow_call (gimple stmt, tree *base, HOST_WIDE_INT *exponent)
+{
+  tree fndecl, arg1;
+  REAL_VALUE_TYPE c, cint;
+
+  if (!early_reassoc
+      || !flag_unsafe_math_optimizations
+      || !is_gimple_call (stmt)
+      || !has_single_use (gimple_call_lhs (stmt)))
+    return false;
+
+  fndecl = gimple_call_fndecl (stmt);
+
+  if (!fndecl
+      || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
+    return false;
+
+  switch (DECL_FUNCTION_CODE (fndecl))
+    {
+    CASE_FLT_FN (BUILT_IN_POW):
+      *base = gimple_call_arg (stmt, 0);
+      arg1 = gimple_call_arg (stmt, 1);
+
+      if (TREE_CODE (arg1) != REAL_CST)
+       return false;
+
+      c = TREE_REAL_CST (arg1);
+
+      if (REAL_EXP (&c) > HOST_BITS_PER_WIDE_INT)
+       return false;
+
+      *exponent = real_to_integer (&c);
+      real_from_integer (&cint, VOIDmode, *exponent,
+                        *exponent < 0 ? -1 : 0, 0);
+      if (!real_identical (&c, &cint))
+       return false;
+
+      break;
+
+    CASE_FLT_FN (BUILT_IN_POWI):
+      *base = gimple_call_arg (stmt, 0);
+      arg1 = gimple_call_arg (stmt, 1);
+
+      if (!host_integerp (arg1, 0))
+       return false;
+
+      *exponent = TREE_INT_CST_LOW (arg1);
+      break;
+
+    default:
+      return false;
+    }
+
+  /* Expanding negative exponents is generally unproductive, so we don't
+     complicate matters with those.  Exponents of zero and one should
+     have been handled by expression folding.  */
+  if (*exponent < 2 || TREE_CODE (*base) != SSA_NAME)
+    return false;
+
+  return true;
+}
+
 /* Recursively linearize a binary expression that is the RHS of STMT.
    Place the operands of the expression tree in the vector named OPS.  */
 
@@ -2573,11 +2712,13 @@ linearize_expr_tree (VEC(operand_entry_t, heap) **
 {
   tree binlhs = gimple_assign_rhs1 (stmt);
   tree binrhs = gimple_assign_rhs2 (stmt);
-  gimple binlhsdef, binrhsdef;
+  gimple binlhsdef = NULL, binrhsdef = NULL;
   bool binlhsisreassoc = false;
   bool binrhsisreassoc = false;
   enum tree_code rhscode = gimple_assign_rhs_code (stmt);
   struct loop *loop = loop_containing_stmt (stmt);
+  tree base = NULL_TREE;
+  HOST_WIDE_INT exponent = 0;
 
   if (set_visited)
     gimple_set_visited (stmt, true);
@@ -2615,8 +2756,26 @@ linearize_expr_tree (VEC(operand_entry_t, heap) **
 
       if (!binrhsisreassoc)
        {
-         add_to_ops_vec (ops, binrhs);
-         add_to_ops_vec (ops, binlhs);
+         if (rhscode == MULT_EXPR
+             && TREE_CODE (binrhs) == SSA_NAME
+             && acceptable_pow_call (binrhsdef, &base, &exponent))
+           {
+             add_repeat_to_ops_vec (ops, base, exponent);
+             gimple_set_visited (binrhsdef, true);
+           }
+         else
+           add_to_ops_vec (ops, binrhs);
+
+         if (rhscode == MULT_EXPR
+             && TREE_CODE (binlhs) == SSA_NAME
+             && acceptable_pow_call (binlhsdef, &base, &exponent))
+           {
+             add_repeat_to_ops_vec (ops, base, exponent);
+             gimple_set_visited (binlhsdef, true);
+           }
+         else
+           add_to_ops_vec (ops, binlhs);
+
          return;
        }
 
@@ -2655,7 +2814,16 @@ linearize_expr_tree (VEC(operand_entry_t, heap) **
                                      rhscode, loop));
   linearize_expr_tree (ops, SSA_NAME_DEF_STMT (binlhs),
                       is_associative, set_visited);
-  add_to_ops_vec (ops, binrhs);
+
+  if (rhscode == MULT_EXPR
+      && TREE_CODE (binrhs) == SSA_NAME
+      && acceptable_pow_call (SSA_NAME_DEF_STMT (binrhs), &base, &exponent))
+    {
+      add_repeat_to_ops_vec (ops, base, exponent);
+      gimple_set_visited (SSA_NAME_DEF_STMT (binrhs), true);
+    }
+  else
+    add_to_ops_vec (ops, binrhs);
 }
 
 /* Repropagate the negates back into subtracts, since no other pass
@@ -2815,6 +2983,364 @@ break_up_subtract_bb (basic_block bb)
     break_up_subtract_bb (son);
 }
 
+/* Used for repeated factor analysis.  */
+struct repeat_factor_d
+{
+  /* An SSA name that occurs in a multiply chain.  */
+  tree factor;
+
+  /* Cached rank of the factor.  */
+  unsigned rank;
+
+  /* Number of occurrences of the factor in the chain.  */
+  HOST_WIDE_INT count;
+
+  /* An SSA name representing the product of this factor and
+     all factors appearing later in the repeated factor vector.  */
+  tree repr;
+};
+
+typedef struct repeat_factor_d repeat_factor, *repeat_factor_t;
+typedef const struct repeat_factor_d *const_repeat_factor_t;
+
+DEF_VEC_O (repeat_factor);
+DEF_VEC_ALLOC_O (repeat_factor, heap);
+
+static VEC (repeat_factor, heap) *repeat_factor_vec;
+
+/* Used for sorting the repeat factor vector.  Sort primarily by
+   ascending occurrence count, secondarily by descending rank.  */
+
+static int
+compare_repeat_factors (const void *x1, const void *x2)
+{
+  const_repeat_factor_t rf1 = (const_repeat_factor_t) x1;
+  const_repeat_factor_t rf2 = (const_repeat_factor_t) x2;
+
+  if (rf1->count != rf2->count)
+    return rf1->count - rf2->count;
+
+  return rf2->rank - rf1->rank;
+}
+
+/* Get a new SSA name for register variable *TARGET of type TYPE.
+   If *TARGET is null or incompatible with TYPE, create the variable
+   first.  */
+
+static tree
+get_reassoc_pow_ssa_name (tree *target, tree type)
+{
+  if (!*target || !types_compatible_p (type, TREE_TYPE (*target)))
+    {
+      *target = create_tmp_reg (type, "reassocpow");
+      add_referenced_var (*target);
+    }
+
+  return make_ssa_name (*target, NULL);
+}
+
+/* Look for repeated operands in OPS in the multiply tree rooted at
+   STMT.  Replace them with an optimal sequence of multiplies and powi
+   builtin calls, and remove the used operands from OPS.  Return an
+   SSA name representing the value of the replacement sequence.  */
+
+static tree
+attempt_builtin_powi (gimple stmt, VEC(operand_entry_t, heap) **ops,
+                     tree *target)
+{
+  unsigned i, j, vec_len;
+  int ii;
+  operand_entry_t oe;
+  repeat_factor_t rf1, rf2;
+  repeat_factor rfnew;
+  tree target_ssa, iter_result;
+  tree result = NULL_TREE;
+  tree type = TREE_TYPE (gimple_get_lhs (stmt));
+  tree powi_fndecl = mathfn_built_in (type, BUILT_IN_POWI);
+  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+  gimple mul_stmt, pow_stmt;
+
+  /* Nothing to do if BUILT_IN_POWI doesn't exist for this type and
+     target.  */
+  if (!powi_fndecl)
+    return NULL_TREE;
+
+  /* Allocate the repeated factor vector.  */
+  repeat_factor_vec = VEC_alloc (repeat_factor, heap, 10);
+
+  /* Scan the OPS vector for all SSA names in the product and build
+     up a vector of occurrence counts for each factor.  */
+  FOR_EACH_VEC_ELT (operand_entry_t, *ops, i, oe)
+    {
+      if (TREE_CODE (oe->op) == SSA_NAME)
+       {
+         FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+           {
+             if (rf1->factor == oe->op)
+               {
+                 rf1->count += oe->count;
+                 break;
+               }
+           }
+
+         if (j >= VEC_length (repeat_factor, repeat_factor_vec))
+           {
+             rfnew.factor = oe->op;
+             rfnew.rank = oe->rank;
+             rfnew.count = oe->count;
+             rfnew.repr = NULL_TREE;
+             VEC_safe_push (repeat_factor, heap, repeat_factor_vec, &rfnew);
+           }
+       }
+    }
+
+  /* Sort the repeated factor vector by (a) increasing occurrence count,
+     and (b) decreasing rank.  */
+  VEC_qsort (repeat_factor, repeat_factor_vec, compare_repeat_factors);
+
+  /* It is generally best to combine as many base factors as possible
+     into a product before applying __builtin_powi to the result.
+     However, the sort order chosen for the repeated factor vector
+     allows us to cache partial results for the product of the base
+     factors for subsequent use.  When we already have a cached partial
+     result from a previous iteration, it is best to make use of it
+     before looking for another __builtin_pow opportunity.
+
+     As an example, consider x * x * y * y * y * z * z * z * z.
+     We want to first compose the product x * y * z, raise it to the
+     second power, then multiply this by y * z, and finally multiply
+     by z.  This can be done in 5 multiplies provided we cache y * z
+     for use in both expressions:
+
+        t1 = y * z
+       t2 = t1 * x
+       t3 = t2 * t2
+       t4 = t1 * t3
+       result = t4 * z
+
+     If we instead ignored the cached y * z and first multiplied by
+     the __builtin_pow opportunity z * z, we would get the inferior:
+
+        t1 = y * z
+       t2 = t1 * x
+       t3 = t2 * t2
+       t4 = z * z
+       t5 = t3 * t4
+        result = t5 * y  */
+
+  vec_len = VEC_length (repeat_factor, repeat_factor_vec);
+  
+  /* Repeatedly look for opportunities to create a builtin_powi call.  */
+  while (true)
+    {
+      HOST_WIDE_INT power;
+
+      /* First look for the largest cached product of factors from
+        preceding iterations.  If found, create a builtin_powi for
+        it if the minimum occurrence count for its factors is at
+        least 2, or just use this cached product as our next 
+        multiplicand if the minimum occurrence count is 1.  */
+      FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+       {
+         if (rf1->repr && rf1->count > 0)
+           break;
+       }
+
+      if (j < vec_len)
+       {
+         power = rf1->count;
+
+         if (power == 1)
+           {
+             iter_result = rf1->repr;
+
+             if (dump_file && (dump_flags & TDF_DETAILS))
+               {
+                 unsigned elt;
+                 repeat_factor_t rf;
+                 fputs ("Multiplying by cached product ", dump_file);
+                 for (elt = j; elt < vec_len; elt++)
+                   {
+                     rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+                     print_generic_expr (dump_file, rf->factor, 0);
+                     if (elt < vec_len - 1)
+                       fputs (" * ", dump_file);
+                   }
+                 fputs ("\n", dump_file);
+               }
+           }
+         else
+           {
+             iter_result = get_reassoc_pow_ssa_name (target, type);
+             pow_stmt = gimple_build_call (powi_fndecl, 2, rf1->repr, 
+                                           build_int_cst (integer_type_node,
+                                                          power));
+             gimple_call_set_lhs (pow_stmt, iter_result);
+             gimple_set_location (pow_stmt, gimple_location (stmt));
+             gsi_insert_before (&gsi, pow_stmt, GSI_SAME_STMT);
+
+             if (dump_file && (dump_flags & TDF_DETAILS))
+               {
+                 unsigned elt;
+                 repeat_factor_t rf;
+                 fputs ("Building __builtin_pow call for cached product (",
+                        dump_file);
+                 for (elt = j; elt < vec_len; elt++)
+                   {
+                     rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+                     print_generic_expr (dump_file, rf->factor, 0);
+                     if (elt < vec_len - 1)
+                       fputs (" * ", dump_file);
+                   }
+                 fprintf (dump_file, ")^%ld\n", power);
+               }
+           }
+       }
+      else
+       {
+         /* Otherwise, find the first factor in the repeated factor
+            vector whose occurrence count is at least 2.  If no such
+            factor exists, there are no builtin_powi opportunities
+            remaining.  */
+         FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
+           {
+             if (rf1->count >= 2)
+               break;
+           }
+
+         if (j >= vec_len)
+           break;
+
+         power = rf1->count;
+
+         if (dump_file && (dump_flags & TDF_DETAILS))
+           {
+             unsigned elt;
+             repeat_factor_t rf;
+             fputs ("Building __builtin_pow call for (", dump_file);
+             for (elt = j; elt < vec_len; elt++)
+               {
+                 rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
+                 print_generic_expr (dump_file, rf->factor, 0);
+                 if (elt < vec_len - 1)
+                   fputs (" * ", dump_file);
+               }
+             fprintf (dump_file, ")^%ld\n", power);
+           }
+
+         reassociate_stats.pows_created++;
+
+         /* Visit each element of the vector in reverse order (so that
+            high-occurrence elements are visited first, and within the
+            same occurrence count, lower-ranked elements are visited
+            first).  Form a linear product of all elements in this order
+            whose occurrence count is at least that of element J.
+            Record the SSA name representing the product of each element
+            with all subsequent elements in the vector.  */
+         if (j == vec_len - 1)
+           rf1->repr = rf1->factor;
+         else
+           {
+             for (ii = vec_len - 2; ii >= (int)j; ii--)
+               {
+                 tree op1, op2;
+
+                 rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
+                 rf2 = VEC_index (repeat_factor, repeat_factor_vec, ii + 1);
+
+                 /* Init the last factor's representative to be itself.  */
+                 if (!rf2->repr)
+                   rf2->repr = rf2->factor;
+
+                 op1 = rf1->factor;
+                 op2 = rf2->repr;
+
+                 target_ssa = get_reassoc_pow_ssa_name (target, type);
+                 mul_stmt = gimple_build_assign_with_ops (MULT_EXPR,
+                                                          target_ssa,
+                                                          op1, op2);
+                 gimple_set_location (mul_stmt, gimple_location (stmt));
+                 gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+                 rf1->repr = target_ssa;
+
+                 /* Don't reprocess the multiply we just introduced.  */
+                 gimple_set_visited (mul_stmt, true);
+               }
+           }
+
+         /* Form a call to __builtin_powi for the maximum product
+            just formed, raised to the power obtained earlier.  */
+         rf1 = VEC_index (repeat_factor, repeat_factor_vec, j);
+         iter_result = get_reassoc_pow_ssa_name (target, type);
+         pow_stmt = gimple_build_call (powi_fndecl, 2, rf1->repr, 
+                                       build_int_cst (integer_type_node,
+                                                      power));
+         gimple_call_set_lhs (pow_stmt, iter_result);
+         gimple_set_location (pow_stmt, gimple_location (stmt));
+         gsi_insert_before (&gsi, pow_stmt, GSI_SAME_STMT);
+       }
+
+      /* Decrement the occurrence count of each element in the product
+        by the count found above, and remove this many copies of each
+        factor from OPS.  */
+      for (i = j; i < vec_len; i++)
+       {
+         unsigned k = power;
+         unsigned n;
+
+         rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
+         rf1->count -= power;
+         
+         FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
+           {
+             if (oe->op == rf1->factor)
+               {
+                 if (oe->count <= k)
+                   {
+                     VEC_ordered_remove (operand_entry_t, *ops, n);
+                     k -= oe->count;
+
+                     if (k == 0)
+                       break;
+                   }
+                 else
+                   {
+                     oe->count -= k;
+                     break;
+                   }
+               }
+           }
+       }
+
+      /* If we previously formed at least one other builtin_powi call,
+        form the product of this one and those others.  */
+      if (result)
+       {
+         tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
+         mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
+                                                  iter_result, result);
+         gimple_set_location (mul_stmt, gimple_location (stmt));
+         gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
+         result = new_target_ssa;
+
+         /* Don't reprocess the multiply we just introduced.  */
+         gimple_set_visited (mul_stmt, true);
+       }
+      else
+       result = iter_result;
+    }
+
+  /* At this point all elements in the repeated factor vector have a
+     remaining occurrence count of 0 or 1, and those with a count of 1
+     don't have cached representatives.  Clean up.  */
+  VEC_free (repeat_factor, heap, repeat_factor_vec);
+
+  /* Return the final product as computed herein.  Note that there
+     may still be some elements with single occurrence count left
+     in OPS; those will be handled by the normal reassociation logic.  */
+  return result;
+}
+
 /* Reassociate expressions in basic block BB and its post-dominator as
    children.  */
 
@@ -2823,6 +3349,7 @@ reassociate_bb (basic_block bb)
 {
   gimple_stmt_iterator gsi;
   basic_block son;
+  tree target = NULL_TREE;
 
   for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
     {
@@ -2883,6 +3410,8 @@ reassociate_bb (basic_block bb)
 
          if (associative_tree_code (rhs_code))
            {
+             tree powi_result = NULL_TREE;
+
              VEC(operand_entry_t, heap) *ops = NULL;
 
              /* There may be no immediate uses left by the time we
@@ -2904,7 +3433,14 @@ reassociate_bb (basic_block bb)
              if (rhs_code == BIT_IOR_EXPR || rhs_code == BIT_AND_EXPR)
                optimize_range_tests (rhs_code, &ops);
 
-             if (VEC_length (operand_entry_t, ops) == 1)
+             if (early_reassoc
+                 && rhs_code == MULT_EXPR
+                 && flag_unsafe_math_optimizations)
+               powi_result = attempt_builtin_powi (stmt, &ops, &target);
+
+             /* If the operand vector is now empty, all operands were
+                consumed by the __builtin_pow optimization.  */
+             if (VEC_length (operand_entry_t, ops) == 0)
                {
                  if (dump_file && (dump_flags & TDF_DETAILS))
                    {
@@ -2913,11 +3449,11 @@ reassociate_bb (basic_block bb)
                    }
 
                  rhs1 = gimple_assign_rhs1 (stmt);
-                 gimple_assign_set_rhs_from_tree (&gsi,
-                                                  VEC_last (operand_entry_t,
-                                                            ops)->op);
+                 rhs2 = gimple_assign_rhs2 (stmt);
+                 gimple_assign_set_rhs_from_tree (&gsi, powi_result);
                  update_stmt (stmt);
                  remove_visited_stmt_chain (rhs1);
+                 remove_def_if_absorbed_call (rhs2);
 
                  if (dump_file && (dump_flags & TDF_DETAILS))
                    {
@@ -2925,6 +3461,41 @@ reassociate_bb (basic_block bb)
                      print_gimple_stmt (dump_file, stmt, 0, 0);
                    }
                }
+
+             else if (VEC_length (operand_entry_t, ops) == 1)
+               {
+                 tree last_op = VEC_last (operand_entry_t, ops)->op;
+                 if (dump_file && (dump_flags & TDF_DETAILS))
+                   {
+                     fprintf (dump_file, "Transforming ");
+                     print_gimple_stmt (dump_file, stmt, 0, 0);
+                   }
+
+                 if (powi_result)
+                   {
+                     rhs1 = gimple_assign_rhs1 (stmt);
+                     rhs2 = gimple_assign_rhs2 (stmt);
+                     gimple_assign_set_rhs_with_ops_1 (&gsi, MULT_EXPR,
+                                                       powi_result, last_op,
+                                                       NULL_TREE);
+                     update_stmt (gsi_stmt (gsi));
+                     remove_visited_stmt_chain (rhs1);
+                     remove_def_if_absorbed_call (rhs2);
+                   }
+                 else
+                   {
+                     rhs1 = gimple_assign_rhs1 (stmt);
+                     gimple_assign_set_rhs_from_tree (&gsi, last_op);
+                     update_stmt (stmt);
+                     remove_visited_stmt_chain (rhs1);
+                   }
+
+                 if (dump_file && (dump_flags & TDF_DETAILS))
+                   {
+                     fprintf (dump_file, " into ");
+                     print_gimple_stmt (dump_file, stmt, 0, 0);
+                   }
+               }
              else
                {
                  enum machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
@@ -2940,6 +3511,25 @@ reassociate_bb (basic_block bb)
                    rewrite_expr_tree_parallel (stmt, width, ops);
                  else
                    rewrite_expr_tree (stmt, 0, ops, false);
+
+                 /* If we combined some repeated factors into a
+                    __builtin_powi call, multiply that result by the
+                    reassociated operands.  */
+                 if (powi_result)
+                   {
+                     gimple mul_stmt;
+                     tree type = TREE_TYPE (gimple_get_lhs (stmt));
+                     tree target_ssa = get_reassoc_pow_ssa_name (&target,
+                                                                 type);
+                     gimple_set_lhs (stmt, target_ssa);
+                     update_stmt (stmt);
+                     mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, lhs,
+                                                              powi_result,
+                                                              target_ssa);
+                     gimple_set_location (mul_stmt, gimple_location (stmt));
+                     gsi_insert_after (&gsi, mul_stmt, GSI_NEW_STMT);
+                     gimple_set_visited (mul_stmt, true);
+                   }
                }
 
              VEC_free (operand_entry_t, heap, ops);
@@ -3054,6 +3644,10 @@ fini_reassoc (void)
                            reassociate_stats.ops_eliminated);
   statistics_counter_event (cfun, "Statements rewritten",
                            reassociate_stats.rewritten);
+  statistics_counter_event (cfun, "Built-in pow[i] calls encountered",
+                           reassociate_stats.pows_encountered);
+  statistics_counter_event (cfun, "Built-in powi calls created",
+                           reassociate_stats.pows_created);
 
   pointer_map_destroy (operand_rank);
   free_alloc_pool (operand_entry_pool);
@@ -3077,19 +3671,33 @@ execute_reassoc (void)
   return 0;
 }
 
+/* Entry point for the first ("early") reassociation pass.  Sets the
+   EARLY_REASSOC flag before delegating to the common driver; the flag
+   enables the repeated-factor / __builtin_powi handling that
+   reassociate_bb performs only when early_reassoc is true.  */
+static unsigned int
+execute_early_reassoc (void)
+{
+  early_reassoc = true;
+  return execute_reassoc ();
+}
+
+/* Entry point for the second ("late") reassociation pass.  Clears the
+   EARLY_REASSOC flag before delegating to the common driver, so the
+   __builtin_powi repeated-factor logic guarded by early_reassoc in
+   reassociate_bb is not attempted again.  */
+static unsigned int
+execute_late_reassoc (void)
+{
+  early_reassoc = false;
+  return execute_reassoc ();
+}
+
 static bool
 gate_tree_ssa_reassoc (void)
 {
   return flag_tree_reassoc != 0;
 }
 
-struct gimple_opt_pass pass_reassoc =
+struct gimple_opt_pass pass_early_reassoc =
 {
  {
   GIMPLE_PASS,
-  "reassoc",                           /* name */
+  "reassoc1",                          /* name */
   gate_tree_ssa_reassoc,               /* gate */
-  execute_reassoc,                     /* execute */
+  execute_early_reassoc,               /* execute */
   NULL,                                        /* sub */
   NULL,                                        /* next */
   0,                                   /* static_pass_number */
@@ -3103,3 +3711,24 @@ gate_tree_ssa_reassoc (void)
     | TODO_ggc_collect                 /* todo_flags_finish */
  }
 };
+
+/* Pass descriptor for the second reassociation instance ("reassoc2"),
+   which runs execute_late_reassoc (early_reassoc cleared).  Mirrors
+   pass_early_reassoc apart from name and execute hook; both share
+   gate_tree_ssa_reassoc and TV_TREE_REASSOC.  */
+struct gimple_opt_pass pass_late_reassoc =
+{
+ {
+  GIMPLE_PASS,
+  "reassoc2",                          /* name */
+  gate_tree_ssa_reassoc,               /* gate */
+  execute_late_reassoc,                        /* execute */
+  NULL,                                        /* sub */
+  NULL,                                        /* next */
+  0,                                   /* static_pass_number */
+  TV_TREE_REASSOC,                     /* tv_id */
+  PROP_cfg | PROP_ssa,                 /* properties_required */
+  0,                                   /* properties_provided */
+  0,                                   /* properties_destroyed */
+  0,                                   /* todo_flags_start */
+  TODO_verify_ssa
+    | TODO_verify_flow
+    | TODO_ggc_collect                 /* todo_flags_finish */
+ }
+};


Reply via email to