On Tue, Mar 20, 2012 at 2:04 PM, Sriraman Tallam <tmsri...@google.com> wrote: > This patch adds support to version for corei7 with -mvarch option. The > versioning supported is in the case where a loop generates a LCP stalling > instruction in corei7. In such cases, on corei7, limiting the unroll factor > to try to keep the unrolled loop body small enough to fit in the Corei7's > loop stream detector can hide LCP stalls in loops. With mvarch, the function > containing the loop is multi-versioned and one version is tagged with > "tune=corei7" so that the unroll factor can be limited on this version. > > Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01230.html for > discussion on mvarch option. > Please see: http://gcc.gnu.org/ml/gcc-patches/2011-12/msg00123.html for > discussion on LCP stalls in corei7. > > > The autocloning framework is only avaiable in google/gcc-4_6 branch. I am > working on porting this to trunk. > > * config/i386/i386.c (find_himode_assigns): New function. > (mversionable_for_core2_p): Add new param version_number. > (mversionable_for_corei7_p): New function. > (ix86_mversion_function): Check for corei7 versioning. > * params.def (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING): Bump > allowed limit to 5000. > * mversn-dispatch.c (do_auto_clone): Reverse fn_ver_addr_chain. > > Index: config/i386/i386.c > =================================================================== > --- config/i386/i386.c (revision 185514) > +++ config/i386/i386.c (working copy) > @@ -26507,6 +26507,132 @@ any_loops_vectorizable_with_load_store (void) > return vectorizable_loop_found; > } > > +/* Returns true if this function finds a loop that contains a possible LCP > + stalling instruction on corei7. This is used to multiversion functions > + for corei7. > + > + This function looks for instructions that store a constant into > + HImode (16-bit) memory. These require a length-changing prefix and on > + corei7 are prone to LCP stalls. These stalls can be avoided if the loop > + is streamed from the loop stream detector. */ > + > +static bool > +find_himode_assigns (void) > +{ > + gimple_stmt_iterator gsi; > + gimple stmt; > + enum gimple_code code; > + tree lhs/*, rhs*/;
Can rhs be removed? > + enum machine_mode mode; > + basic_block *body; > + unsigned i; > + loop_iterator li; > + struct loop *loop; > + bool found = false; > + location_t locus = 0; locus is dead (assigned but not read). > + int stmt_count; > + unsigned HOST_WIDE_INT n_unroll, max_unroll; > + > + if (!flag_unroll_loops) > + return false; > + > + loop_optimizer_init (LOOPS_NORMAL > + | LOOPS_HAVE_RECORDED_EXITS); > + if (number_of_loops () < 1) > + return false; > + > + scev_initialize(); > + > + if (profile_status == PROFILE_READ) > + max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES_FEEDBACK); > + else > + max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES); It might be clearer to rename max_unroll to max_peel_times or something like that to be clearer. > + > + FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST) > + { > + tree niter; > + > + /* Will not peel/unroll cold areas. */ > + if (optimize_loop_for_size_p (loop)) > + continue; > + > + /* Can the loop be manipulated? */ > + if (!can_duplicate_loop_p (loop)) > + continue; > + > + niter = number_of_latch_executions (loop); > + if (host_integerp (niter, 1)) > + { > + n_unroll = tree_low_cst (niter, 1); > + if (n_unroll <= max_unroll) > + continue; > + } > + > + body = get_loop_body (loop); > + found = false; > + stmt_count = 0; > + > + for (i = 0; i < loop->num_nodes; i++) > + { > + for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next > (&gsi)) > + { > + stmt = gsi_stmt (gsi); > + stmt_count++; > + if (found) > + continue; > + code = gimple_code (stmt); > + if (code != GIMPLE_ASSIGN) > + continue; > + lhs = gimple_assign_lhs (stmt); > + if (TREE_CODE (lhs) != MEM_REF && > + TREE_CODE (lhs) != COMPONENT_REF && > + TREE_CODE (lhs) != ARRAY_REF) > + continue; > + if (gimple_assign_rhs_code(stmt) != INTEGER_CST) > + continue; > + mode = TYPE_MODE (TREE_TYPE (lhs)); > + if (mode == HImode) > + { > + locus = gimple_location (stmt); > + found = true; > + } > + } > + } > + /* Don't worry about large loops that won't be unrolled anyway. In > fact, > + * don't worry about unrolling loops that are already over the size of > the > + * LSD (28 insts). Since instruction counts may be a little off at this > + * point, due to downstream transformations, include loops a little > bigger > + * than the LSD size. > + */ > + if (found && stmt_count < 40) > + { > + n_unroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS)/stmt_count; > + /* Check for a simple peel candidate */ > + if (!(loop->header->count > + && expected_loop_iterations (loop) < 2 * n_unroll)) > + { > + location_t locus2; locus2 is dead (assigned but not read) and can be removed. > + edge exit; > + if ((exit = single_exit(loop)) != NULL) > + locus2 = gimple_location(last_stmt(exit->src)); > + else > + locus2 = gimple_location(first_stmt(loop->latch)); > + } > + else > + found = false; > + } > + else > + found = false; > + free (body); > + } > + > + scev_finalize(); > + loop_optimizer_finalize(); > + return found; > +} > + > /* This makes the function that chooses the version to execute based > on the condition. This condition function will decide which version > of the function to execute. It should look like this: > @@ -26601,7 +26727,7 @@ create_mtune_target_opt_node (const char *arch_tun > the newly created version, that is tag "tune=core2" on the new version. */ > > static bool > -mversionable_for_core2_p (tree *optimization_node, > +mversionable_for_core2_p (int version_number, tree *optimization_node, > tree *cond_func_decl, basic_block *new_bb) > { > tree predicate_decl; > @@ -26635,11 +26761,60 @@ static bool > *optimization_node = create_mtune_target_opt_node ("core2"); > > predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_CORE2]; > - *new_bb = add_condition_to_bb (*cond_func_decl, 0, *new_bb, > + *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb, > predicate_decl, false); > return true; > } > > +/* Should a version of this function be specially optimized for core2? > + > + This function should have checks to see if there are any opportunities for > + core2 specific optimizations, otherwise do not create a clone. The > + following opportunities are checked. The above 2 references to core2 should be changed to corei7? > + > + * Check if there are any loops that contain LCP stalling instruction on > + corei7. > + > + This versioning is triggered only when multi-versioning for corei7 is > + requested using -mvarch=corei7. > + > + Return false if no versioning is required. Return true if a version must > + be created. Generate the *OPTIMIZATION_NODE that must be used to optimize > + the newly created version, that is tag "tune=corei7" on the new version. > */ > + > +static bool > +mversionable_for_corei7_p (int version_number, tree *optimization_node, > + tree *cond_func_decl, basic_block *new_bb) > +{ > + tree predicate_decl; > + bool is_mversion_target_corei7 = false; > + bool create_version = false; > + > + if (ix86_varch_specified > + && (ix86_varch[PROCESSOR_COREI7_64] > + || ix86_varch[PROCESSOR_COREI7_32])) > + is_mversion_target_corei7 = true; > + > + if (is_mversion_target_corei7 && find_himode_assigns ()) > + create_version = true; > + /* else if XXX: Add more criteria to version for corei7. */ > + > + if (!create_version) > + return false; > + > + /* If the condition function's body has not been created, create it now. > */ > + if (*cond_func_decl == NULL) > + *cond_func_decl = make_condition_function (new_bb); > + > + *optimization_node = create_mtune_target_opt_node ("corei7"); > + > + predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_COREI7]; > + *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb, > + predicate_decl, false); > + return true; > +} > + > + > /* Should this function CURRENT_FUNCTION_DECL be multi-versioned, if so > the number of versions to be created (other than the original) is > returned. The outcome of COND_FUNC_DECL will decide the version to be > @@ -26654,19 +26829,33 @@ ix86_mversion_function (tree fndecl ATTRIBUTE_UNUS > basic_block new_bb; > tree optimization_node; > int num_versions_created = 0; > - > + > if (ix86_mv_arch_string == NULL) > return 0; > > - if (mversionable_for_core2_p (&optimization_node, cond_func_decl, &new_bb)) > - num_versions_created++; > + if (mversionable_for_core2_p (num_versions_created, &optimization_node, > + cond_func_decl, &new_bb)) > + { > + num_versions_created++; > + *optimization_node_chain = tree_cons (optimization_node, > + NULL_TREE, > + *optimization_node_chain); > + } > > + if (mversionable_for_corei7_p (num_versions_created, &optimization_node, > + cond_func_decl, &new_bb)) > + { > + num_versions_created++; > + *optimization_node_chain = tree_cons (optimization_node, > + NULL_TREE, > + *optimization_node_chain); > + } > + > + *optimization_node_chain = nreverse (*optimization_node_chain); What is the nreverse here and below needed for? Thanks, Teresa > + > if (!num_versions_created) > return 0; > > - *optimization_node_chain = tree_cons (optimization_node, > - NULL_TREE, *optimization_node_chain); > - > /* Return the default version as the last stmt in cond_func_decl. */ > if (*cond_func_decl != NULL) > new_bb = add_condition_to_bb (*cond_func_decl, num_versions_created, > Index: params.def > =================================================================== > --- params.def (revision 185514) > +++ params.def (working copy) > @@ -1040,7 +1040,7 @@ DEFPARAM (PARAM_PMU_PROFILE_N_ADDRESS, > DEFPARAM (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING, > "autoclone-function-size-limit", > "Do not auto clone functions beyond this size.", > - 450, 0, 100000) > + 5000, 0, 100000) > > /* > Local variables: > Index: mversn-dispatch.c > =================================================================== > --- mversn-dispatch.c (revision 185514) > +++ mversn-dispatch.c (working copy) > @@ -2423,6 +2423,8 @@ do_auto_clone (void) > opt_node = TREE_CHAIN (opt_node); > } > > + fn_ver_addr_chain = nreverse (fn_ver_addr_chain); > + > /* The current function is replaced by an ifunc call to the right version. > Make another clone for the default. */ > default_decl = clone_function (current_function_decl, "autoclone.original"); > @@ -2432,6 +2434,7 @@ do_auto_clone (void) > default_ver = build_fold_addr_expr (default_decl); > cond_func_addr = build_fold_addr_expr (cond_func_decl); > > + > /* Get the gimple sequence to replace the current function's body with a > ifunc dispatch call to the right version. */ > gseq = dispatch_using_ifunc (num_versions, current_function_decl, > > -- > This patch is available for review at http://codereview.appspot.com/5865043 -- Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413