[PATCH] Take branch misprediction effects into account when RTL loop unrolling (issue6099055)

Teresa Johnson Tue, 24 Apr 2012 14:27:08 -0700

This patch adds heuristics to limit unrolling in loops with branches that may 
increase
branch mispredictions. It affects loops that are not frequently iterated, and 
that are
nested within a hot region of code that already contains many branch 
instructions.


Performance tested with both internal benchmarks and with SPEC 2000/2006 on a 
variety
of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD 
Opteron systems.
This improves performance of an internal search indexing benchmark by close to 
2% on
all the tested Intel platforms.  It also consistently improves 445.gobmk (with 
FDO feedback
where unrolling kicks in) by close to 1% on AMD Opteron. Other performance 
effects are
neutral.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?

Thanks,
Teresa

2012-04-24   Teresa Johnson  <tejohn...@google.com>

        * loop-unroll.c (loop_has_call): New function.
        (loop_has_FP_comp): Ditto.
        (compute_weighted_branches): Ditto.
        (max_unroll_with_branches): Ditto.
        (decide_unroll_constant_iterations): Add heuristic to avoid
        increasing branch mispredicts when unrolling.
        (decide_unroll_runtime_iterations): Ditto.
        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.

Index: loop-unroll.c
===================================================================
--- loop-unroll.c       (revision 186783)
+++ loop-unroll.c       (working copy)
@@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
                                             basic_block);
 static rtx get_expansion (struct var_to_expand *);
 
+/* Determine whether LOOP contains call.  */
+static bool
+loop_has_call(struct loop *loop)
+{
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          if (CALL_P (insn))
+            {
+              free (body);
+              return true;
+            }
+        }
+    }
+  free (body);
+  return false;
+}
+
+/* Determine whether LOOP contains floating-point computation.  */
+static bool
+loop_has_FP_comp(struct loop *loop)
+{
+  rtx set, dest;
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          set = single_set (insn);
+          if (!set)
+            continue;
+
+          dest = SET_DEST (set);
+          if (FLOAT_MODE_P (GET_MODE (dest)))
+            {
+              free (body);
+              return true;
+            }
+        }
+    }
+  free (body);
+  return false;
+}
+
+/* Compute the number of branches in LOOP, weighted by execution counts.  */
+static float
+compute_weighted_branches(struct loop *loop)
+{
+  int header_count = loop->header->count;
+  unsigned i;
+  float n;
+  basic_block * body;
+
+  /* If no profile feedback data exists, don't limit unrolling  */
+  if (header_count == 0)
+    return 0.0;
+
+  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
+
+  body = get_loop_body (loop);
+  n = 0.0;
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      if (EDGE_COUNT (body[i]->succs) >= 2)
+        {
+          /* If this block is executed less frequently than the header (loop
+             entry), then it is weighted based on the ratio of times it is
+             executed compared to the header.  */
+          if (body[i]->count < header_count)
+            n += ((float)body[i]->count)/header_count;
+
+          /* When it is executed more frequently than the header (i.e. it is
+             in a nested inner loop), simply weight the branch at 1.0.  */
+          else
+            n += 1.0;
+        }
+    }
+  free (body);
+
+  return n;
+}
+
+/* Compute the maximum number of times LOOP can be unrolled without exceeding
+   a branch budget, which can increase branch mispredictions. The number of
+   branches is computed by weighting each branch with its expected execution
+   probability through the loop based on profile data. If no profile feedback
+   data exists, simply return the current NUNROLL factor.  */
+static unsigned
+max_unroll_with_branches(struct loop *loop, unsigned nunroll)
+{
+  struct loop *outer;
+  struct niter_desc *outer_desc;
+  int outer_niters = 1;
+  float weighted_outer_branches = 0.0;
+  float weighted_num_branches = compute_weighted_branches (loop);
+
+  /* If there was no profile feedback data, weighted_num_branches will be 0.0
+     and we won't limit unrolling. If the weighted_num_branches is at most 1.0,
+     also don't limit unrolling as the back-edge branch will not be 
duplicated.  */
+  if (weighted_num_branches <= 1.0)
+    return nunroll;
+
+  /* Walk up the loop tree until we find a hot outer loop in which the current
+     loop is nested. At that point we will compute the number of times the
+     current loop can be unrolled based on the number of branches in the hot
+     outer loop.  */
+  outer = loop_outer(loop);
+  /* The loop structure contains a fake outermost loop, so this should always
+     be non-NULL for our current loop.  */
+  gcc_assert (outer);
+  /* Detect if this is the fake outermost loop (at which point we are done)
+     by checking its outer loop.  */
+  while (loop_outer(outer))
+    {
+      outer_desc = get_simple_loop_desc (outer);
+
+      if (outer_desc->const_iter)
+        outer_niters *= outer_desc->niter;
+      else if (outer->header->count)
+        outer_niters *= expected_loop_iterations (outer);
+
+      weighted_outer_branches = compute_weighted_branches (outer);
+
+      /* Should have been checked by caller.  */
+      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
+
+      /* If the outer loop has enough iterations to be considered hot, then
+         we can stop our upwards loop tree traversal and examine the current
+         outer loop.  */
+      if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+        {
+          /* Assume that any call will cause the branch budget to be exceeded,
+             and that we can't unroll the current loop without increasing
+             mispredicts.  */
+          if (loop_has_call(outer))
+            return 0;
+
+          /* Otherwise, compute the maximum number of times current loop can be
+             unrolled without exceeding our branch budget. First we subtract
+             off the outer loop's weighted branch count from the budget. Note
+             that this includes the branches in the current loop. This yields
+             the number of branches left in the budget for the unrolled copies.
+             We divide this by the number of branches in the current loop that
+             must be duplicated when we unroll, which is the total weighted
+             number of branches minus the back-edge branch. This yields the
+             number of new loop body copies that can be created by unrolling
+             without exceeding the budget, to which we add 1 to get the unroll
+             factor.  */
+          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
+                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
+        }
+      outer = loop_outer(outer);
+    }
+
+  /* The current loop is not enclosed by a hot enough outer loop in this
+     procedure, since the hot outer loop is inter-procedural, assume that
+     it already contains a significant number of branches, so don't unroll.  */
+  return 0;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -522,6 +696,7 @@ static void
 decide_unroll_constant_iterations (struct loop *loop, int flags)
 {
   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
+  unsigned nunroll_branches;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts. Ignore loops with FP computation as these
+     tend to benefit much more consistently from unrolling.  */
+  if (num_loop_branches (loop) > 1
+      && loop_has_FP_comp(loop)
+      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
+      && desc->niter < (unsigned) PARAM_VALUE 
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+    {
+      nunroll_branches = max_unroll_with_branches(loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+           fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* Check whether the loop rolls enough to consider.  */
   if (desc->niter < 2 * nunroll)
     {
@@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
 static void
 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 {
-  unsigned nunroll, nunroll_by_av, i;
+  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts. Ignore loops with FP computation as these
+     tend to benefit much more consistently from unrolling.  */
+  if (num_loop_branches (loop) > 1
+      && loop_has_FP_comp(loop)
+      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
+      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE 
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+    {
+      nunroll_branches = max_unroll_with_branches(loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+            fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* If we have profile feedback, check whether the loop rolls.  */
   if ((loop->header->count
        && expected_loop_iterations (loop) < 2 * nunroll)
Index: params.def
===================================================================
--- params.def  (revision 186783)
+++ params.def  (working copy)
@@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
         "The maximum depth of a loop nest we completely peel",
         8, 0, 0)
 
+DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
+       "min-iter-unroll-with-branches",
+       "Minimum iteration count to ignore branch effects when unrolling",
+       50, 0, 0)
+
+DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
+       "unroll-outer-loop-branch-budget",
+       "Maximum number of branches allowed in hot outer loop region after 
unroll",
+       25, 0, 0)
+
 /* The maximum number of insns of an unswitched loop.  */
 DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
        "max-unswitch-insns",

--
This patch is available for review at http://codereview.appspot.com/6099055

[PATCH] Take branch misprediction effects into account when RTL loop unrolling (issue6099055)

Reply via email to