Hi,

  I created a bug for this issue: 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94269 
  Looks like widening_mul phase may move multiply instruction from outside the 
loop to inside the loop, merging with one add instruction inside the loop.  
  This will increase the cost of the loop at least on aarch64 (4 cycles vs 1 
cycle).  I think widening_mul should consider block frequency when doing such a 
combination.  
  I mean something like:
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 54ba035..4439452 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -2721,7 +2721,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, 
gimple *stmt,
     {
       if (!has_single_use (rhs1)
          || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
-                                 &type2, &mult_rhs2))
+                                 &type2, &mult_rhs2)
+         || (gimple_bb (rhs1_stmt) != gimple_bb (stmt)
+             && gimple_bb (rhs1_stmt)->count.to_frequency(cfun)
+                < gimple_bb (stmt)->count.to_frequency(cfun)))
        return false;
       add_rhs = rhs2;
       conv_stmt = conv1_stmt;
@@ -2730,7 +2733,10 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, 
gimple *stmt,
     {
       if (!has_single_use (rhs2)
          || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
-                                 &type2, &mult_rhs2))
+                                 &type2, &mult_rhs2)
+         || (gimple_bb (rhs2_stmt) != gimple_bb (stmt)
+             && gimple_bb (rhs2_stmt)->count.to_frequency(cfun)
+                < gimple_bb (stmt)->count.to_frequency(cfun)))
        return false;
       add_rhs = rhs1;
       conv_stmt = conv2_stmt;

  Comments?

Thanks,
Felix

Reply via email to