This patch adds heuristics to limit unrolling in loops with branches that may increase branch mispredictions. It affects loops that are not frequently iterated, and that are nested within a hot region of code that already contains many branch instructions.
Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems. This improves performance of an internal search indexing benchmark by close to 2% on all the tested Intel platforms. It also consistently improves 445.gobmk (with FDO feedback where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are neutral. Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? Thanks, Teresa 2012-04-24 Teresa Johnson <tejohn...@google.com> * loop-unroll.c (loop_has_call): New function. (loop_has_FP_comp): Ditto. (compute_weighted_branches): Ditto. (max_unroll_with_branches): Ditto. (decide_unroll_constant_iterations): Add heuristic to avoid increasing branch mispredicts when unrolling. (decide_unroll_runtime_iterations): Ditto. * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. Index: loop-unroll.c =================================================================== --- loop-unroll.c (revision 186783) +++ loop-unroll.c (working copy) @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc basic_block); static rtx get_expansion (struct var_to_expand *); +/* Determine whether LOOP contains call. */ +static bool +loop_has_call(struct loop *loop) +{ + basic_block *body, bb; + unsigned i; + rtx insn; + + body = get_loop_body (loop); + for (i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + + FOR_BB_INSNS (bb, insn) + { + if (CALL_P (insn)) + { + free (body); + return true; + } + } + } + free (body); + return false; +} + +/* Determine whether LOOP contains floating-point computation. */ +static bool +loop_has_FP_comp(struct loop *loop) +{ + rtx set, dest; + basic_block *body, bb; + unsigned i; + rtx insn; + + body = get_loop_body (loop); + for (i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + + FOR_BB_INSNS (bb, insn) + { + set = single_set (insn); + if (!set) + continue; + + dest = SET_DEST (set); + if (FLOAT_MODE_P (GET_MODE (dest))) + { + free (body); + return true; + } + } + } + free (body); + return false; +} + +/* Compute the number of branches in LOOP, weighted by execution counts. */ +static float +compute_weighted_branches(struct loop *loop) +{ + int header_count = loop->header->count; + unsigned i; + float n; + basic_block * body; + + /* If no profile feedback data exists, don't limit unrolling */ + if (header_count == 0) + return 0.0; + + gcc_assert (loop->latch != EXIT_BLOCK_PTR); + + body = get_loop_body (loop); + n = 0.0; + for (i = 0; i < loop->num_nodes; i++) + { + if (EDGE_COUNT (body[i]->succs) >= 2) + { + /* If this block is executed less frequently than the header (loop + entry), then it is weighted based on the ratio of times it is + executed compared to the header. */ + if (body[i]->count < header_count) + n += ((float)body[i]->count)/header_count; + + /* When it is executed more frequently than the header (i.e. it is + in a nested inner loop), simply weight the branch at 1.0. */ + else + n += 1.0; + } + } + free (body); + + return n; +} + +/* Compute the maximum number of times LOOP can be unrolled without exceeding + a branch budget, which can increase branch mispredictions. The number of + branches is computed by weighting each branch with its expected execution + probability through the loop based on profile data. If no profile feedback + data exists, simply return the current NUNROLL factor. */ +static unsigned +max_unroll_with_branches(struct loop *loop, unsigned nunroll) +{ + struct loop *outer; + struct niter_desc *outer_desc; + int outer_niters = 1; + float weighted_outer_branches = 0.0; + float weighted_num_branches = compute_weighted_branches (loop); + + /* If there was no profile feedback data, weighted_num_branches will be 0.0 + and we won't limit unrolling. If the weighted_num_branches is at most 1.0, + also don't limit unrolling as the back-edge branch will not be duplicated. */ + if (weighted_num_branches <= 1.0) + return nunroll; + + /* Walk up the loop tree until we find a hot outer loop in which the current + loop is nested. At that point we will compute the number of times the + current loop can be unrolled based on the number of branches in the hot + outer loop. */ + outer = loop_outer(loop); + /* The loop structure contains a fake outermost loop, so this should always + be non-NULL for our current loop. */ + gcc_assert (outer); + /* Detect if this is the fake outermost loop (at which point we are done) + by checking its outer loop. */ + while (loop_outer(outer)) + { + outer_desc = get_simple_loop_desc (outer); + + if (outer_desc->const_iter) + outer_niters *= outer_desc->niter; + else if (outer->header->count) + outer_niters *= expected_loop_iterations (outer); + + weighted_outer_branches = compute_weighted_branches (outer); + + /* Should have been checked by caller. */ + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); + + /* If the outer loop has enough iterations to be considered hot, then + we can stop our upwards loop tree traversal and examine the current + outer loop. */ + if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) + { + /* Assume that any call will cause the branch budget to be exceeded, + and that we can't unroll the current loop without increasing + mispredicts. */ + if (loop_has_call(outer)) + return 0; + + /* Otherwise, compute the maximum number of times current loop can be + unrolled without exceeding our branch budget. First we subtract + off the outer loop's weighted branch count from the budget. Note + that this includes the branches in the current loop. This yields + the number of branches left in the budget for the unrolled copies. + We divide this by the number of branches in the current loop that + must be duplicated when we unroll, which is the total weighted + number of branches minus the back-edge branch. This yields the + number of new loop body copies that can be created by unrolling + without exceeding the budget, to which we add 1 to get the unroll + factor. */ + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - + weighted_outer_branches)/(weighted_num_branches - 1) + 1; + } + outer = loop_outer(outer); + } + + /* The current loop is not enclosed by a hot enough outer loop in this + procedure, since the hot outer loop is inter-procedural, assume that + it already contains a significant number of branches, so don't unroll. */ + return 0; +} + /* Unroll and/or peel (depending on FLAGS) LOOPS. */ void unroll_and_peel_loops (int flags) @@ -522,6 +696,7 @@ static void decide_unroll_constant_iterations (struct loop *loop, int flags) { unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; + unsigned nunroll_branches; struct niter_desc *desc; if (!(flags & UAP_UNROLL)) @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo return; } + /* Be careful when unrolling loops with branches inside -- it can increase + the number of mispredicts. Ignore loops with FP computation as these + tend to benefit much more consistently from unrolling. */ + if (num_loop_branches (loop) > 1 + && loop_has_FP_comp(loop) + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 + && desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) + { + nunroll_branches = max_unroll_with_branches(loop, nunroll); + if (nunroll > nunroll_branches) + nunroll = nunroll_branches; + if (nunroll <= 1) + { + if (dump_file) + fprintf (dump_file, ";; Not unrolling, contains branches\n"); + return; + } + } + /* Check whether the loop rolls enough to consider. */ if (desc->niter < 2 * nunroll) { @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop static void decide_unroll_runtime_iterations (struct loop *loop, int flags) { - unsigned nunroll, nunroll_by_av, i; + unsigned nunroll, nunroll_by_av, nunroll_branches, i; struct niter_desc *desc; if (!(flags & UAP_UNROLL)) @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo return; } + /* Be careful when unrolling loops with branches inside -- it can increase + the number of mispredicts. Ignore loops with FP computation as these + tend to benefit much more consistently from unrolling. */ + if (num_loop_branches (loop) > 1 + && loop_has_FP_comp(loop) + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) + { + nunroll_branches = max_unroll_with_branches(loop, nunroll); + if (nunroll > nunroll_branches) + nunroll = nunroll_branches; + if (nunroll <= 1) + { + if (dump_file) + fprintf (dump_file, ";; Not unrolling, contains branches\n"); + return; + } + } + /* If we have profile feedback, check whether the loop rolls. */ if ((loop->header->count && expected_loop_iterations (loop) < 2 * nunroll) Index: params.def =================================================================== --- params.def (revision 186783) +++ params.def (working copy) @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, "The maximum depth of a loop nest we completely peel", 8, 0, 0) +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, + "min-iter-unroll-with-branches", + "Minimum iteration count to ignore branch effects when unrolling", + 50, 0, 0) + +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, + "unroll-outer-loop-branch-budget", + "Maximum number of branches allowed in hot outer loop region after unroll", + 25, 0, 0) + /* The maximum number of insns of an unswitched loop. */ DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, "max-unswitch-insns", -- This patch is available for review at http://codereview.appspot.com/6099055