The following avoids heap allocations for setting up the loop iterator in most cases. It also avoids re-allocating the PHI vector all the time in mark_phi_for_rewrite. Plus it fixes some of the slowness we observe in compute_idf which uses a) bad iteration order and b) a worklist implementation not avoiding duplicates. I see up to 30% duplicate blocks on the work vector plus we seed the work vector in a way that later blocks are visited first (but only ever later blocks are queued so first first is more efficient).
The vec.h hunk is to aovid a bootstrap fail due to a -Wfree-nonheap-object warning from the loop iterator change. IIRC we saw such thing in the past and Honza also mentioned this recently. Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. Richard. 2019-11-21 Richard Biener <rguent...@suse.de> * cfgloop.h (loop_iterator::~loop_iterator): Remove. (loop_iterator::to_visit): Use an auto_vec with internal storage. (loop_iterator::loop_iterator): Adjust. * cfganal.c (compute_dominance_frontiers_1): Fold into... (compute_dominance_frontiers): ... this. Hoist invariant get_immediate_dominator call. (compute_idf): Use a work-set instead of a work-list for more optimal iteration order and duplicate avoidance. * tree-into-ssa.c (mark_phi_for_rewrite): Avoid re-allocating the vector all the time, instead pre-allocate the vector only once. (delete_update_ssa): Simplify. * vec.h (va_heap::release): Disable -Wfree-nonheap-object around it. Index: gcc/cfgloop.h =================================================================== --- gcc/cfgloop.h (revision 278496) +++ gcc/cfgloop.h (working copy) @@ -661,7 +661,6 @@ class loop_iterator { public: loop_iterator (function *fn, loop_p *loop, unsigned flags); - ~loop_iterator (); inline loop_p next (); @@ -669,7 +668,7 @@ public: function *fn; /* The list of loops to visit. */ - vec<int> to_visit; + auto_vec<int, 16> to_visit; /* The index of the actual loop. */ unsigned idx; @@ -702,12 +701,11 @@ loop_iterator::loop_iterator (function * this->fn = fn; if (!loops_for_fn (fn)) { - this->to_visit.create (0); *loop = NULL; return; } - this->to_visit.create (number_of_loops (fn)); + this->to_visit.reserve_exact (number_of_loops (fn)); mn = (flags & LI_INCLUDE_ROOT) ? 0 : 1; if (flags & LI_ONLY_INNERMOST) @@ -769,12 +767,6 @@ loop_iterator::loop_iterator (function * *loop = this->next (); } -inline -loop_iterator::~loop_iterator () -{ - this->to_visit.release (); -} - #define FOR_EACH_LOOP(LOOP, FLAGS) \ for (loop_iterator li(cfun, &(LOOP), FLAGS); \ (LOOP); \ Index: gcc/cfganal.c =================================================================== --- gcc/cfganal.c (revision 278496) +++ gcc/cfganal.c (working copy) @@ -1326,10 +1327,11 @@ dfs_enumerate_from (basic_block bb, int of the dominance frontiers, no more, no less. */ - -static void -compute_dominance_frontiers_1 (bitmap_head *frontiers) +void +compute_dominance_frontiers (bitmap_head *frontiers) { + timevar_push (TV_DOM_FRONTIERS); + edge p; edge_iterator ei; basic_block b; @@ -1337,34 +1339,22 @@ compute_dominance_frontiers_1 (bitmap_he { if (EDGE_COUNT (b->preds) >= 2) { + basic_block domsb = get_immediate_dominator (CDI_DOMINATORS, b); FOR_EACH_EDGE (p, ei, b->preds) { basic_block runner = p->src; - basic_block domsb; if (runner == ENTRY_BLOCK_PTR_FOR_FN (cfun)) continue; - domsb = get_immediate_dominator (CDI_DOMINATORS, b); while (runner != domsb) { - if (!bitmap_set_bit (&frontiers[runner->index], - b->index)) + if (!bitmap_set_bit (&frontiers[runner->index], b->index)) break; - runner = get_immediate_dominator (CDI_DOMINATORS, - runner); + runner = get_immediate_dominator (CDI_DOMINATORS, runner); } } } } -} - - -void -compute_dominance_frontiers (bitmap_head *frontiers) -{ - timevar_push (TV_DOM_FRONTIERS); - - compute_dominance_frontiers_1 (frontiers); timevar_pop (TV_DOM_FRONTIERS); } @@ -1385,25 +1375,26 @@ compute_idf (bitmap def_blocks, bitmap_h unsigned bb_index, i; bitmap phi_insertion_points; - /* Each block can appear at most twice on the work-stack. */ - auto_vec<int> work_stack (2 * n_basic_blocks_for_fn (cfun)); phi_insertion_points = BITMAP_ALLOC (NULL); - /* Seed the work list with all the blocks in DEF_BLOCKS. We use - vec::quick_push here for speed. This is safe because we know that - the number of definition blocks is no greater than the number of - basic blocks, which is the initial capacity of WORK_STACK. */ - EXECUTE_IF_SET_IN_BITMAP (def_blocks, 0, bb_index, bi) - work_stack.quick_push (bb_index); + /* Seed the work set with all the blocks in DEF_BLOCKS. */ + auto_bitmap work_set; + bitmap_copy (work_set, def_blocks); + bitmap_tree_view (work_set); - /* Pop a block off the worklist, add every block that appears in + /* Pop a block off the workset, add every block that appears in the original block's DF that we have not already processed to - the worklist. Iterate until the worklist is empty. Blocks - which are added to the worklist are potential sites for + the workset. Iterate until the workset is empty. Blocks + which are added to the workset are potential sites for PHI nodes. */ - while (work_stack.length () > 0) + while (!bitmap_empty_p (work_set)) { - bb_index = work_stack.pop (); + /* The dominance frontier of a block is blocks after it so iterating + on earlier blocks first is better. + ??? Basic blocks are by no means guaranteed to be ordered in + optimal order for this iteration. */ + bb_index = bitmap_first_set_bit (work_set); + bitmap_clear_bit (work_set, bb_index); /* Since the registration of NEW -> OLD name mappings is done separately from the call to update_ssa, when updating the SSA @@ -1416,7 +1407,7 @@ compute_idf (bitmap def_blocks, bitmap_h EXECUTE_IF_AND_COMPL_IN_BITMAP (&dfs[bb_index], phi_insertion_points, 0, i, bi) { - work_stack.quick_push (i); + bitmap_set_bit (work_set, i); bitmap_set_bit (phi_insertion_points, i); } } Index: gcc/tree-into-ssa.c =================================================================== --- gcc/tree-into-ssa.c (revision 278496) +++ gcc/tree-into-ssa.c (working copy) @@ -940,14 +940,18 @@ mark_phi_for_rewrite (basic_block bb, gp if (!blocks_with_phis_to_rewrite) return; - bitmap_set_bit (blocks_with_phis_to_rewrite, idx); - - n = (unsigned) last_basic_block_for_fn (cfun) + 1; - if (phis_to_rewrite.length () < n) - phis_to_rewrite.safe_grow_cleared (n); - - phis = phis_to_rewrite[idx]; - phis.reserve (10); + if (bitmap_set_bit (blocks_with_phis_to_rewrite, idx)) + { + n = (unsigned) last_basic_block_for_fn (cfun) + 1; + if (phis_to_rewrite.length () < n) + phis_to_rewrite.safe_grow_cleared (n); + + phis = phis_to_rewrite[idx]; + gcc_assert (!phis.exists ()); + phis.create (10); + } + else + phis = phis_to_rewrite[idx]; phis.safe_push (phi); phis_to_rewrite[idx] = phis; @@ -2937,11 +2941,7 @@ delete_update_ssa (void) if (blocks_with_phis_to_rewrite) EXECUTE_IF_SET_IN_BITMAP (blocks_with_phis_to_rewrite, 0, i, bi) - { - vec<gphi *> phis = phis_to_rewrite[i]; - phis.release (); - phis_to_rewrite[i].create (0); - } + phis_to_rewrite[i].release (); BITMAP_FREE (blocks_with_phis_to_rewrite); BITMAP_FREE (blocks_to_update); Index: gcc/vec.h =================================================================== --- gcc/vec.h (revision 278548) +++ gcc/vec.h (working copy) @@ -295,6 +295,11 @@ va_heap::reserve (vec<T, va_heap, vl_emb } +#if GCC_VERSION >= 4007 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfree-nonheap-object" +#endif + /* Free the heap space allocated for vector V. */ template<typename T> @@ -312,6 +317,9 @@ va_heap::release (vec<T, va_heap, vl_emb v = NULL; } +#if GCC_VERSION >= 4007 +#pragma GCC diagnostic pop +#endif /* Allocator type for GC vectors. Notice that we need the structure declaration even if GC is not enabled. */