On Mon, Jun 6, 2016 at 3:19 AM, Jan Hubicka <hubi...@ucw.cz> wrote:
> Hi,
> while looking into profile mismatches introduced by the backward threading 
> pass
> I noticed that the heuristics seems quite simplistics.  First it should be
> profile sensitive and disallow duplication when optimizing cold paths. Second
> it should use estimate_num_insns because gimple statement count is not really
> very realistic estimate of final code size effect and third there seems to be
> no reason to disable the pass for functions optimized for size.
>
> If we block duplication for more than 1 insns for size optimized paths the 
> pass
> is able to do majority of threading decisions that are for free and improve 
> codegen.
> The code size benefit was between 0.5% to 2.7% on testcases I tried (tramp3d,
> GCC modules, xlanancbmk and some other stuff around my hd).
>
> Bootstrapped/regtested x86_64-linux, seems sane?
>
> The pass should also avoid calling cleanup_cfg when no trheading was done
> and i do not see why it is guarded by expensive_optimizations. What are the
> main compile time complexity limitations?

This patch caused a huge regression (~11%) on coremarks on ThunderX.
I assume other targets too.
Basically it looks like the path is no longer thread jumped.

Thanks,
Andrew


>
> Honza
>
>         * tree-ssa-threadbackward.c: Include tree-inline.h
>         (profitable_jump_thread_path): Use estimate_num_insns to estimate
>         size of copied block; for cold paths reduce duplication.
>         (find_jump_threads_backwards): Remove redundant tests.
>         (pass_thread_jumps::gate): Enable for -Os.
>         * gcc.dg/tree-ssa/ssa-dom-thread-7.c: Update testcase.
> Index: tree-ssa-threadbackward.c
> ===================================================================
> --- tree-ssa-threadbackward.c   (revision 237101)
> +++ tree-ssa-threadbackward.c   (working copy)
> @@ -35,6 +35,7 @@ along with GCC; see the file COPYING3.
>  #include "tree-pass.h"
>  #include "gimple-ssa.h"
>  #include "tree-phinodes.h"
> +#include "tree-inline.h"
>
>  static int max_threaded_paths;
>
> @@ -210,7 +211,7 @@ profitable_jump_thread_path (vec<basic_b
>                   && !(gimple_code (stmt) == GIMPLE_ASSIGN
>                        && gimple_assign_rhs_code (stmt) == ASSERT_EXPR)
>                   && !is_gimple_debug (stmt))
> -               ++n_insns;
> +               n_insns += estimate_num_insns (stmt, &eni_size_weights);
>             }
>
>           /* We do not look at the block with the threaded branch
> @@ -238,13 +239,15 @@ profitable_jump_thread_path (vec<basic_b
>         threaded_through_latch = true;
>      }
>
> +  gimple *stmt = get_gimple_control_stmt ((*path)[0]);
> +  gcc_assert (stmt);
> +
>    /* We are going to remove the control statement at the end of the
>       last block in the threading path.  So don't count it against our
>       statement count.  */
> -  n_insns--;
>
> -  gimple *stmt = get_gimple_control_stmt ((*path)[0]);
> -  gcc_assert (stmt);
> +  n_insns-= estimate_num_insns (stmt, &eni_size_weights);
> +
>    /* We have found a constant value for ARG.  For GIMPLE_SWITCH
>       and GIMPLE_GOTO, we use it as-is.  However, for a GIMPLE_COND
>       we need to substitute, fold and simplify so we can determine
> @@ -290,12 +293,24 @@ profitable_jump_thread_path (vec<basic_b
>        return NULL;
>      }
>
> -  if (n_insns >= PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
> +  if (optimize_edge_for_speed_p (taken_edge))
> +    {
> +      if (n_insns >= PARAM_VALUE (PARAM_MAX_FSM_THREAD_PATH_INSNS))
> +       {
> +         if (dump_file && (dump_flags & TDF_DETAILS))
> +           fprintf (dump_file, "FSM jump-thread path not considered: "
> +                    "the number of instructions on the path "
> +                    "exceeds PARAM_MAX_FSM_THREAD_PATH_INSNS.\n");
> +         path->pop ();
> +         return NULL;
> +       }
> +    }
> +  else if (n_insns > 1)
>      {
>        if (dump_file && (dump_flags & TDF_DETAILS))
>         fprintf (dump_file, "FSM jump-thread path not considered: "
> -                "the number of instructions on the path "
> -                "exceeds PARAM_MAX_FSM_THREAD_PATH_INSNS.\n");
> +                "duplication of %i insns is needed and optimizing for 
> size.\n",
> +                n_insns);
>        path->pop ();
>        return NULL;
>      }
> @@ -600,10 +615,6 @@ fsm_find_control_statement_thread_paths
>  void
>  find_jump_threads_backwards (basic_block bb)
>  {
> -  if (!flag_expensive_optimizations
> -      || optimize_function_for_size_p (cfun))
> -    return;
> -
>    gimple *stmt = get_gimple_control_stmt (bb);
>    if (!stmt)
>      return;
> @@ -668,8 +679,7 @@ public:
>  bool
>  pass_thread_jumps::gate (function *fun ATTRIBUTE_UNUSED)
>  {
> -  return (flag_expensive_optimizations
> -         && ! optimize_function_for_size_p (cfun));
> +  return flag_expensive_optimizations;
>  }
>
>
> Index: testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c        (revision 237101)
> +++ testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c        (working copy)
> @@ -1,7 +1,7 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-tree-thread1-stats -fdump-tree-thread2-stats 
> -fdump-tree-thread3-stats -fdump-tree-dom3-stats -fdump-tree-vrp2-stats" } */
> +/* { dg-options "-O2 -fdump-tree-thread1-stats -fdump-tree-thread2-stats 
> -fdump-tree-thread3-stats -fdump-tree-dom3-stats -fdump-tree-vrp2-stats 
> -fno-guess-branch-probability" } */
>  /* { dg-final { scan-tree-dump "Jumps threaded: 16"  "thread1" } } */
> -/* { dg-final { scan-tree-dump "Jumps threaded: 11" "thread2" } } */
> +/* { dg-final { scan-tree-dump "Jumps threaded: 6" "thread2" } } */
>  /* { dg-final { scan-tree-dump "Jumps threaded: 3" "thread3" } } */
>  /* { dg-final { scan-tree-dump-not "Jumps threaded"  "dom3" } } */
>  /* { dg-final { scan-tree-dump-not "Jumps threaded"  "vrp2" } } */

Reply via email to