The compile time issue was discovered in SPEC 2017 wrf: we used `time` and -ftime-report to analyze the profile data of the SPEC 2017 wrf compilation.
Before this patch (Lazy vsetvl): scheduling : 121.89 ( 15%) 0.53 ( 11%) 122.72 ( 15%) 13M ( 1%) machine dep reorg : 424.61 ( 53%) 1.84 ( 37%) 427.44 ( 53%) 5290k ( 0%) real 13m27.074s user 13m19.539s sys 0m5.180s Simple vsetvl: machine dep reorg : 0.10 ( 0%) 0.00 ( 0%) 0.11 ( 0%) 4138k ( 0%) real 6m5.780s user 6m2.396s sys 0m2.373s The machine dep reorg entry is the compile time of the VSETVL pass (424 seconds), which accounts for 53% of the compilation time and is much more than the time spent in scheduling. After investigation, the critical path of the VSETVL pass is compute_lcm_local_properties, which is called in every iteration of phase 2 (earliest fusion) and phase 3 (global lcm). This patch optimizes the code of compute_lcm_local_properties to reduce the compilation time. After this patch: scheduling : 117.51 ( 27%) 0.21 ( 6%) 118.04 ( 27%) 13M ( 1%) machine dep reorg : 80.13 ( 18%) 0.91 ( 26%) 81.26 ( 18%) 5290k ( 0%) real 7m25.374s user 7m20.116s sys 0m3.795s The effect of this patch is very obvious: lazy VSETVL pass: 424s (53%) -> 80s (18%), which now spends less time than scheduling. Tested on both RV32 and RV64, no regression. Ok for trunk? PR target/113495 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (extract_single_source): Remove. (pre_vsetvl::compute_vsetvl_def_data): Fix compile time issue. (pre_vsetvl::compute_transparent): New function. (pre_vsetvl::compute_lcm_local_properties): Fix compile time issue. 
--- gcc/config/riscv/riscv-vsetvl.cc | 184 ++++++++++--------------------- 1 file changed, 60 insertions(+), 124 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index d7b40a5c813..cec862329c5 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -599,14 +599,6 @@ extract_single_source (set_info *set) return first_insn; } -static insn_info * -extract_single_source (def_info *def) -{ - if (!def) - return nullptr; - return extract_single_source (dyn_cast<set_info *> (def)); -} - static bool same_equiv_note_p (set_info *set1, set_info *set2) { @@ -2374,6 +2366,7 @@ public: } void compute_vsetvl_def_data (); + void compute_transparent (const bb_info *); void compute_lcm_local_properties (); void fuse_local_vsetvl_info (); @@ -2452,20 +2445,16 @@ pre_vsetvl::compute_vsetvl_def_data () { for (unsigned i = 0; i < m_vsetvl_def_exprs.length (); i += 1) { - const vsetvl_info &info = *m_vsetvl_def_exprs[i]; - if (!info.has_nonvlmax_reg_avl ()) - continue; - unsigned int regno; - sbitmap_iterator sbi; - EXECUTE_IF_SET_IN_BITMAP (m_reg_def_loc[bb->index ()], 0, regno, - sbi) - if (regno == REGNO (info.get_avl ())) - { - bitmap_set_bit (m_kill[bb->index ()], i); - bitmap_set_bit (def_loc[bb->index ()], - get_expr_index (m_vsetvl_def_exprs, - m_unknow_info)); - } + auto *info = m_vsetvl_def_exprs[i]; + if (info->has_nonvlmax_reg_avl () + && bitmap_bit_p (m_reg_def_loc[bb->index ()], + REGNO (info->get_avl ()))) + { + bitmap_set_bit (m_kill[bb->index ()], i); + bitmap_set_bit (def_loc[bb->index ()], + get_expr_index (m_vsetvl_def_exprs, + m_unknow_info)); + } } continue; } @@ -2516,6 +2505,36 @@ pre_vsetvl::compute_vsetvl_def_data () sbitmap_vector_free (m_kill); } +/* Subroutine of compute_lcm_local_properties which Compute local transparent + BB. 
Note that the compile time is very sensitive to compute_transparent and + compute_lcm_local_properties, any change of these 2 functions should be + aware of the compile time changing of the program which has a large number of + blocks, e.g SPEC 2017 wrf. + + Current compile time profile of SPEC 2017 wrf: + + 1. scheduling - 27% + 2. machine dep reorg (VSETVL PASS) - 18% + + VSETVL pass should not spend more time than scheduling in compilation. */ +void +pre_vsetvl::compute_transparent (const bb_info *bb) +{ + int num_exprs = m_exprs.length (); + unsigned bb_index = bb->index (); + for (int i = 0; i < num_exprs; i++) + { + auto *info = m_exprs[i]; + if (info->has_nonvlmax_reg_avl () + && bitmap_bit_p (m_reg_def_loc[bb_index], REGNO (info->get_avl ()))) + bitmap_clear_bit (m_transp[bb_index], i); + else if (info->has_vl () + && bitmap_bit_p (m_reg_def_loc[bb_index], + REGNO (info->get_vl ()))) + bitmap_clear_bit (m_transp[bb_index], i); + } +} + /* Compute the local properties of each recorded expression. Local properties are those that are defined by the block, irrespective of @@ -2572,7 +2591,7 @@ pre_vsetvl::compute_lcm_local_properties () bitmap_vector_clear (m_avloc, last_basic_block_for_fn (cfun)); bitmap_vector_clear (m_antloc, last_basic_block_for_fn (cfun)); - bitmap_vector_clear (m_transp, last_basic_block_for_fn (cfun)); + bitmap_vector_ones (m_transp, last_basic_block_for_fn (cfun)); /* - If T is locally available at the end of a block, then T' must be available at the end of the same block. 
Since some optimization has @@ -2598,117 +2617,34 @@ pre_vsetvl::compute_lcm_local_properties () /* Compute m_transp */ if (block_info.empty_p ()) + compute_transparent (bb); + else { - bitmap_ones (m_transp[bb_index]); - for (int i = 0; i < num_exprs; i += 1) - { - const vsetvl_info &info = *m_exprs[i]; - if (!info.has_nonvlmax_reg_avl () && !info.has_vl ()) - continue; - - if (info.has_nonvlmax_reg_avl ()) - { - unsigned int regno; - sbitmap_iterator sbi; - EXECUTE_IF_SET_IN_BITMAP (m_reg_def_loc[bb->index ()], 0, - regno, sbi) - { - if (regno == REGNO (info.get_avl ())) - bitmap_clear_bit (m_transp[bb->index ()], i); - } - } - - for (insn_info *insn : bb->real_nondebug_insns ()) - { - if (info.has_nonvlmax_reg_avl () - && find_access (insn->defs (), REGNO (info.get_avl ()))) - { - bitmap_clear_bit (m_transp[bb_index], i); - break; - } - - if (info.has_vl () - && reg_mentioned_p (info.get_vl (), insn->rtl ())) - { - if (find_access (insn->defs (), REGNO (info.get_vl ()))) - /* We can't fuse vsetvl into the blocks that modify the - VL operand since successors of such blocks will need - the value of those blocks are defining. - - bb 4: def a5 - / \ - bb 5:use a5 bb 6:vsetvl a5, 5 - - The example above shows that we can't fuse vsetvl - from bb 6 into bb 4 since the successor bb 5 is using - the value defined in bb 4. */ - ; - else - { - /* We can't fuse vsetvl into the blocks that use the - VL operand which has different value from the - vsetvl info. - - bb 4: def a5 - | - bb 5: use a5 - | - bb 6: def a5 - | - bb 7: use a5 - - The example above shows that we can't fuse vsetvl - from bb 6 into bb 5 since their value is different. 
- */ - resource_info resource - = full_register (REGNO (info.get_vl ())); - def_lookup dl = crtl->ssa->find_def (resource, insn); - def_info *def - = dl.matching_set_or_last_def_of_prev_group (); - insn_info *def_insn = extract_single_source (def); - if (def_insn && vsetvl_insn_p (def_insn->rtl ())) - { - vsetvl_info def_info = vsetvl_info (def_insn); - if (m_dem.compatible_p (def_info, info)) - continue; - } - } + bitmap_clear (m_transp[bb_index]); + vsetvl_info &header_info = block_info.get_entry_info (); + vsetvl_info &footer_info = block_info.get_exit_info (); - bitmap_clear_bit (m_transp[bb_index], i); - break; - } - } - } + if (header_info.valid_p () && anticipated_exp_p (header_info)) + bitmap_set_bit (m_antloc[bb_index], + get_expr_index (m_exprs, header_info)); - continue; + if (footer_info.valid_p ()) + for (int i = 0; i < num_exprs; i += 1) + { + const vsetvl_info &info = *m_exprs[i]; + if (!info.valid_p ()) + continue; + if (available_exp_p (footer_info, info)) + bitmap_set_bit (m_avloc[bb_index], i); + } } - vsetvl_info &header_info = block_info.get_entry_info (); - vsetvl_info &footer_info = block_info.get_exit_info (); - - if (header_info.valid_p () && anticipated_exp_p (header_info)) - bitmap_set_bit (m_antloc[bb_index], - get_expr_index (m_exprs, header_info)); - - if (footer_info.valid_p ()) - for (int i = 0; i < num_exprs; i += 1) - { - const vsetvl_info &info = *m_exprs[i]; - if (!info.valid_p ()) - continue; - if (available_exp_p (footer_info, info)) - bitmap_set_bit (m_avloc[bb_index], i); - } - } - - for (const bb_info *bb : crtl->ssa->bbs ()) - { - unsigned bb_index = bb->index (); if (invalid_opt_bb_p (bb->cfg_bb ())) { bitmap_clear (m_antloc[bb_index]); bitmap_clear (m_transp[bb_index]); } + /* Compute ae_kill for each basic block using: ~(TRANSP | COMP) -- 2.36.1