On Tue, Mar 20, 2012 at 2:04 PM, Sriraman Tallam <tmsri...@google.com> wrote:
> This patch adds support to version for corei7 with -mvarch option. The 
> versioning supported is in the case where a loop generates a LCP stalling 
> instruction in corei7. In such cases, on corei7, limiting the unroll factor 
> to try to keep the unrolled loop body small enough to fit in the Corei7's 
> loop stream detector can hide LCP stalls in loops. With mvarch, the function 
> containing the loop is multi-versioned and one version is tagged with 
> "tune=corei7" so that the unroll factor can be limited on this version.
>
> Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01230.html for 
> discussion on mvarch option.
> Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg00123.html for 
> discussion  on LCP stalls in corei7.
>
>
> The autocloning framework is only avaiable in google/gcc-4_6 branch. I am 
> working on porting this to trunk.
>
>        * config/i386/i386.c (find_himode_assigns): New function.
>        (mversionable_for_core2_p): Add new param version_number.
>        (mversionable_for_corei7_p): New function.
>        (ix86_mversion_function): Check for corei7 versioning.
>        * params.def (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING): Bump
>        allowed limit to 5000.
>        *  mversn-dispatch.c (do_auto_clone): Reverse fn_ver_addr_chain.
>
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 185514)
> +++ config/i386/i386.c  (working copy)
> @@ -26507,6 +26507,132 @@ any_loops_vectorizable_with_load_store (void)
>   return vectorizable_loop_found;
>  }
>
> +/* Returns true if this function finds a loop that contains a possible LCP
> +   stalling instruction on corei7.   This is used to multiversion functions
> +   for corei7.
> +
> +   This function looks for instructions that store a constant into
> +   HImode (16-bit) memory. These require a length-changing prefix and on
> +   corei7 are prone to LCP stalls. These stalls can be avoided if the loop
> +   is streamed from the loop stream detector.  */
> +
> +static bool
> +find_himode_assigns (void)
> +{
> +  gimple_stmt_iterator gsi;
> +  gimple stmt;
> +  enum gimple_code code;
> +  tree lhs/*, rhs*/;

Can rhs be removed?

> +  enum machine_mode mode;
> +  basic_block *body;
> +  unsigned i;
> +  loop_iterator li;
> +  struct loop *loop;
> +  bool found = false;
> +  location_t locus = 0;

locus is dead (assigned but not read).

> +  int stmt_count;
> +  unsigned HOST_WIDE_INT n_unroll, max_unroll;
> +
> +  if (!flag_unroll_loops)
> +    return false;
> +
> +  loop_optimizer_init (LOOPS_NORMAL
> +                       | LOOPS_HAVE_RECORDED_EXITS);
> +  if (number_of_loops () < 1)
> +    return false;
> +
> +  scev_initialize();
> +
> +  if (profile_status == PROFILE_READ)
> +    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES_FEEDBACK);
> +  else
> +    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);

It might be clearer to rename max_unroll to max_peel_times or
something like that to be clearer.

> +
> +  FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
> +    {
> +      tree niter;
> +
> +      /* Will not peel/unroll cold areas.  */
> +      if (optimize_loop_for_size_p (loop))
> +        continue;
> +
> +      /* Can the loop be manipulated?  */
> +      if (!can_duplicate_loop_p (loop))
> +        continue;
> +
> +      niter = number_of_latch_executions (loop);
> +      if (host_integerp (niter, 1))
> +       {
> +         n_unroll = tree_low_cst (niter, 1);
> +         if (n_unroll <= max_unroll)
> +           continue;
> +       }
> +
> +      body = get_loop_body (loop);
> +      found = false;
> +      stmt_count = 0;
> +
> +      for (i = 0; i < loop->num_nodes; i++)
> +       {
> +         for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next 
> (&gsi))
> +           {
> +             stmt = gsi_stmt (gsi);
> +             stmt_count++;
> +             if (found)
> +               continue;
> +             code = gimple_code (stmt);
> +             if (code != GIMPLE_ASSIGN)
> +               continue;
> +             lhs = gimple_assign_lhs (stmt);
> +             if (TREE_CODE (lhs) != MEM_REF &&
> +                 TREE_CODE (lhs) != COMPONENT_REF &&
> +                 TREE_CODE (lhs) != ARRAY_REF)
> +               continue;
> +             if (gimple_assign_rhs_code(stmt) != INTEGER_CST)
> +               continue;
> +             mode = TYPE_MODE (TREE_TYPE (lhs));
> +             if (mode == HImode)
> +               {
> +                 locus = gimple_location (stmt);
> +                 found = true;
> +               }
> +          }
> +       }
> +      /* Don't worry about large loops that won't be unrolled anyway. In 
> fact,
> +       * don't worry about unrolling loops that are already over the size of 
> the
> +       * LSD (28 insts). Since instruction counts may be a little off at this
> +       * point, due to downstream transformations, include loops a little 
> bigger
> +       * than the LSD size.
> +       */
> +      if (found && stmt_count < 40)
> +       {
> +         n_unroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS)/stmt_count;
> +         /* Check for a simple peel candidate */
> +         if (!(loop->header->count
> +               && expected_loop_iterations (loop) < 2 * n_unroll))
> +           {
> +             location_t locus2;

locus2 is dead (assigned but not read) and can be removed.

> +             edge exit;
> +             if ((exit = single_exit(loop)) != NULL)
> +               locus2 = gimple_location(last_stmt(exit->src));
> +             else
> +               locus2 = gimple_location(first_stmt(loop->latch));
> +           }
> +         else
> +           found = false;
> +       }
> +      else
> +       found = false;
> +      free (body);
> +    }
> +
> +  scev_finalize();
> +  loop_optimizer_finalize();
> +  return found;
> +}
> +
>  /* This makes the function that chooses the version to execute based
>    on the condition.  This condition function will decide which version
>    of the function to execute.  It should look like this:
> @@ -26601,7 +26727,7 @@ create_mtune_target_opt_node (const char *arch_tun
>    the newly created version, that is tag "tune=core2" on the new version.  */
>
>  static bool
> -mversionable_for_core2_p (tree *optimization_node,
> +mversionable_for_core2_p (int version_number, tree *optimization_node,
>                          tree *cond_func_decl, basic_block *new_bb)
>  {
>   tree predicate_decl;
> @@ -26635,11 +26761,60 @@ static bool
>   *optimization_node = create_mtune_target_opt_node ("core2");
>
>   predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_CORE2];
> -  *new_bb = add_condition_to_bb (*cond_func_decl, 0, *new_bb,
> +  *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb,
>                                 predicate_decl, false);
>   return true;
>  }
>
> +/* Should a version of this function be specially optimized for core2?
> +
> +   This function should have checks to see if there are any opportunities for
> +   core2 specific optimizations, otherwise do not create a clone.  The
> +   following opportunities are checked.

The above 2 references to core2 should be changed to corei7?

> +
> +   * Check if there are any loops that contain LCP stalling instruction on
> +     corei7.
> +
> +   This versioning is triggered only when multi-versioning for corei7 is
> +   requested using -mvarch=corei7.
> +
> +   Return false if no versioning is required.  Return true if a version must
> +   be created.  Generate the *OPTIMIZATION_NODE that must be used to optimize
> +   the newly created version, that is tag "tune=corei7" on the new version.  
> */
> +
> +static bool
> +mversionable_for_corei7_p (int version_number, tree *optimization_node,
> +                          tree *cond_func_decl, basic_block *new_bb)
> +{
> +  tree predicate_decl;
> +  bool is_mversion_target_corei7 = false;
> +  bool create_version = false;
> +
> +  if (ix86_varch_specified
> +      && (ix86_varch[PROCESSOR_COREI7_64]
> +         || ix86_varch[PROCESSOR_COREI7_32]))
> +    is_mversion_target_corei7 = true;
> +
> +  if (is_mversion_target_corei7 && find_himode_assigns ())
> +    create_version = true;
> +  /* else if XXX: Add more criteria to version for corei7.  */
> +
> +  if (!create_version)
> +    return false;
> +
> +  /* If the condition function's body has not been created, create it now.  
> */
> +  if (*cond_func_decl == NULL)
> +    *cond_func_decl = make_condition_function (new_bb);
> +
> +  *optimization_node = create_mtune_target_opt_node ("corei7");
> +
> +  predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_COREI7];
> +  *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb,
> +                                predicate_decl, false);
> +  return true;
> +}
> +
> +
>  /* Should this function CURRENT_FUNCTION_DECL be multi-versioned, if so
>    the number of versions to be created (other than the original) is
>    returned.  The outcome of COND_FUNC_DECL will decide the version to be
> @@ -26654,19 +26829,33 @@ ix86_mversion_function (tree fndecl ATTRIBUTE_UNUS
>   basic_block new_bb;
>   tree optimization_node;
>   int num_versions_created = 0;
> -
> +
>   if (ix86_mv_arch_string == NULL)
>     return 0;
>
> -  if (mversionable_for_core2_p (&optimization_node, cond_func_decl, &new_bb))
> -    num_versions_created++;
> +  if (mversionable_for_core2_p (num_versions_created, &optimization_node,
> +                               cond_func_decl, &new_bb))
> +    {
> +      num_versions_created++;
> +      *optimization_node_chain = tree_cons (optimization_node,
> +                                           NULL_TREE,
> +                                           *optimization_node_chain);
> +    }
>
> +  if (mversionable_for_corei7_p (num_versions_created, &optimization_node,
> +                                cond_func_decl, &new_bb))
> +    {
> +      num_versions_created++;
> +      *optimization_node_chain = tree_cons (optimization_node,
> +                                           NULL_TREE,
> +                                           *optimization_node_chain);
> +    }
> +
> +  *optimization_node_chain = nreverse (*optimization_node_chain);

What is the nreverse here and below needed for?

Thanks,
Teresa

> +
>   if (!num_versions_created)
>     return 0;
>
> -  *optimization_node_chain = tree_cons (optimization_node,
> -                                       NULL_TREE, *optimization_node_chain);
> -
>   /* Return the default version as the last stmt in cond_func_decl.  */
>   if (*cond_func_decl != NULL)
>     new_bb = add_condition_to_bb (*cond_func_decl, num_versions_created,
> Index: params.def
> ===================================================================
> --- params.def  (revision 185514)
> +++ params.def  (working copy)
> @@ -1040,7 +1040,7 @@ DEFPARAM (PARAM_PMU_PROFILE_N_ADDRESS,
>  DEFPARAM (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING,
>          "autoclone-function-size-limit",
>          "Do not auto clone functions beyond this size.",
> -         450, 0, 100000)
> +         5000, 0, 100000)
>
>  /*
>  Local variables:
> Index: mversn-dispatch.c
> ===================================================================
> --- mversn-dispatch.c   (revision 185514)
> +++ mversn-dispatch.c   (working copy)
> @@ -2423,6 +2423,8 @@ do_auto_clone (void)
>       opt_node = TREE_CHAIN (opt_node);
>     }
>
> +  fn_ver_addr_chain = nreverse (fn_ver_addr_chain);
> +
>   /* The current function is replaced by an ifunc call to the right version.
>      Make another clone for the default.  */
>   default_decl = clone_function (current_function_decl, "autoclone.original");
> @@ -2432,6 +2434,7 @@ do_auto_clone (void)
>   default_ver = build_fold_addr_expr (default_decl);
>   cond_func_addr = build_fold_addr_expr (cond_func_decl);
>
> +
>   /* Get the gimple sequence to replace the current function's body with a
>      ifunc dispatch call to the right version.  */
>   gseq = dispatch_using_ifunc (num_versions, current_function_decl,
>
> --
> This patch is available for review at http://codereview.appspot.com/5865043



-- 
Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413

Reply via email to