I noticed that the loop unroller peels an extra copy of the loop before it
enters the switch block code that rounds the iteration count to a multiple of
the unroll factor. This peeled copy is only needed when the exit test is at
the beginning of the loop, since in that case the unroller inserts the test
for zero peel iterations before that peeled copy.

This patch bumps the iteration count by 1 for loops with the exit at the end so
that it represents the number of times the loop body is executed, which
removes the need to always execute that first peeled copy. With this change,
when the number of executions of the loop is an exact multiple of the unroll
factor, the code jumps to the unrolled loop immediately instead of executing
all the switch code and peeled copies of the loop and then falling into the
unrolled loop. This change also reduces code size by removing a peeled copy of
the loop.

Bootstrapped and regression-tested on powerpc64le with no new regressions. OK
for trunk?



2016-09-22  Pat Haugen  <pthau...@us.ibm.com>

        * loop-unroll.c (unroll_loop_runtime_iterations): Condition initial
        loop peel to loops with exit test at the beginning.


Index: gcc/loop-unroll.c
===================================================================
--- gcc/loop-unroll.c	(revision 240167)
+++ gcc/loop-unroll.c	(working copy)
@@ -857,7 +857,7 @@ unroll_loop_runtime_iterations (struct l
   rtx old_niter, niter, tmp;
   rtx_insn *init_code, *branch_code;
   unsigned i, j, p;
-  basic_block preheader, *body, swtch, ezc_swtch;
+  basic_block preheader, *body, swtch, ezc_swtch = NULL;
   int may_exit_copy;
   unsigned n_peel;
   edge e;
@@ -916,6 +916,16 @@ unroll_loop_runtime_iterations (struct l
   if (tmp != niter)
     emit_move_insn (niter, tmp);
 
+  /* For loops that exit at end, add one to niter to account for first pass
+     through loop body before reaching exit test. */
+  if (exit_at_end)
+    {
+      niter = expand_simple_binop (desc->mode, PLUS,
+				   niter, const1_rtx,
+				   NULL_RTX, 0, OPTAB_LIB_WIDEN);
+      old_niter = niter;
+    }
+
   /* Count modulo by ANDing it with max_unroll; we use the fact that
      the number of unrollings is a power of two, and thus this is correct
      even if there is overflow in the computation.  */
@@ -934,20 +944,21 @@ unroll_loop_runtime_iterations (struct l
 
   auto_sbitmap wont_exit (max_unroll + 2);
 
-  /* Peel the first copy of loop body (almost always we must leave exit test
-     here; the only exception is when we have extra zero check and the number
-     of iterations is reliable.  Also record the place of (possible) extra
-     zero check.  */
-  bitmap_clear (wont_exit);
-  if (extra_zero_check
-      && !desc->noloop_assumptions)
-    bitmap_set_bit (wont_exit, 1);
-  ezc_swtch = loop_preheader_edge (loop)->src;
-  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
-				      1, wont_exit, desc->out_edge,
-				      &remove_edges,
-				      DLTHE_FLAG_UPDATE_FREQ);
-  gcc_assert (ok);
+  if (extra_zero_check)
+    {
+      /* Peel the first copy of loop body.  Leave the exit test if the number
+	 of iterations is not reliable.  Also record the place of the extra zero
+	 check.  */
+      bitmap_clear (wont_exit);
+      if (!desc->noloop_assumptions)
+	bitmap_set_bit (wont_exit, 1);
+      ezc_swtch = loop_preheader_edge (loop)->src;
+      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
+					  1, wont_exit, desc->out_edge,
+					  &remove_edges,
+					  DLTHE_FLAG_UPDATE_FREQ);
+      gcc_assert (ok);
+    }
 
   /* Record the place where switch will be built for preconditioning.  */
   swtch = split_edge (loop_preheader_edge (loop));

Reply via email to