On Sun, Jun 2, 2024 at 4:13 PM Feng Xue OS <f...@os.amperecomputing.com> wrote:
>
> Please see my comments below.
>
> Thanks,
> Feng
>
> > On Thu, May 30, 2024 at 4:55 PM Feng Xue OS <f...@os.amperecomputing.com>
> > wrote:
> >>
> >> For lane-reducing operations (dot-prod/widen-sum/sad) in loop reduction,
> >> the current vectorizer can only handle the pattern if the reduction chain
> >> does not contain any other operation, whether normal or lane-reducing.
> >>
> >> Actually, to allow multiple arbitrary lane-reducing operations, we need to
> >> support vectorization of loop reduction chains with mixed input vectypes.
> >> Since the lanes of a vectype may vary with the operation, the effective
> >> ncopies of the vectorized statements may also differ from one operation to
> >> another, which causes a mismatch in the vectorized def-use cycles.  A
> >> simple way is to align all operations with the one that has the most
> >> ncopies; the gap can be filled by generating extra trivial pass-through
> >> copies.  For example:
> >>
> >>   int sum = 0;
> >>   for (i)
> >>     {
> >>       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> >>       sum += w[i];               // widen-sum <vector(16) char>
> >>       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
> >>       sum += n[i];               // normal <vector(4) int>
> >>     }
> >>
> >> The vector size is 128-bit, and the vectorization factor is 16.  The
> >> reduction statements would be transformed as:
> >>
> >>   vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >>   vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >>   vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >>   vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >>
> >>   for (i / 16)
> >>     {
> >>       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >>       sum_v1 = sum_v1;  // copy
> >>       sum_v2 = sum_v2;  // copy
> >>       sum_v3 = sum_v3;  // copy
> >>
> >>       sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
> >>       sum_v1 = sum_v1;  // copy
> >>       sum_v2 = sum_v2;  // copy
> >>       sum_v3 = sum_v3;  // copy
> >>
> >>       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >>       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >>       sum_v2 = sum_v2;  // copy
> >>       sum_v3 = sum_v3;  // copy
> >>
> >>       sum_v0 += n_v0[i: 0 ~ 3 ];
> >>       sum_v1 += n_v1[i: 4 ~ 7 ];
> >>       sum_v2 += n_v2[i: 8 ~ 11];
> >>       sum_v3 += n_v3[i: 12 ~ 15];
> >>     }
> >>
> >> Thanks,
> >> Feng
> >>
> >> ...
> >>
> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >> index 20c99f11e9a..b5849dbb08a 100644
> >> --- a/gcc/tree-vect-loop.cc
> >> +++ b/gcc/tree-vect-loop.cc
> >> @@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
> >>    if (!gimple_extract_op (orig_stmt_info->stmt, &op))
> >>      gcc_unreachable ();
> >>
> >> -  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >> -
> >>    if (reduction_type == EXTRACT_LAST_REDUCTION)
> >>      /* No extra instructions are needed in the prologue.  The loop body
> >>         operations are costed in vectorizable_condition.  */
> >> @@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
> >>         initial result of the data reduction, initial value of the index
> >>         reduction.  */
> >>        prologue_stmts = 4;
> >> -  else if (emulated_mixed_dot_prod)
> >> -    /* We need the initial reduction value and two invariants:
> >> -       one that contains the minimum signed value and one that
> >> -       contains half of its negative.  */
> >> -    prologue_stmts = 3;
> >>    else
> >> +    /* We need the initial reduction value.  */
> >>      prologue_stmts = 1;
> >>    prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
> >>                                       scalar_to_vec, stmt_info, 0,
> >> @@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
> >>      }
> >>  }
> >>
> >> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized
> >> +   in the context of LOOP_VINFO; the vector cost is recorded in COST_VEC.
> >> +   Currently there are three such kinds of operations: dot-prod/widen-sum/sad
> >> +   (sum-of-absolute-differences).
> >> +
> >> +   For a lane-reducing operation, the loop reduction path it lies in may
> >> +   contain a normal operation, or another lane-reducing operation with a
> >> +   different input type size, for example:
> >> +
> >> +     int sum = 0;
> >> +     for (i)
> >> +       {
> >> +         ...
> >> +         sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> >> +         sum += w[i];               // widen-sum <vector(16) char>
> >> +         sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
> >> +         sum += n[i];               // normal <vector(4) int>
> >> +         ...
> >> +       }
> >> +
> >> +   The vectorization factor is essentially determined by the operation whose
> >> +   input vectype has the most lanes ("vector(16) char" in the example), while
> >> +   we need to choose the input vectype with the least lanes ("vector(4) int"
> >> +   in the example) for the reduction PHI statement.  */
> >> +
> >> +bool
> >> +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
> >> +                            slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> >> +{
> >> +  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
> >> +  if (!stmt)
> >> +    return false;
> >> +
> >> +  enum tree_code code = gimple_assign_rhs_code (stmt);
> >> +
> >> +  if (!lane_reducing_op_p (code))
> >> +    return false;
> >
> > Can you make sure to return false if STMT_VINFO_REDUC_IDX == -1
> > thus the op is not part of a reduction chain/path?
>
> As I planned, in the 2nd-stage patches (WIP), this function will also handle
> lane-reducing operations that do not directly participate in the reduction,
> like:
>
>    temp = dot_prod1 + dot_prod2;
>    sum += temp;
>
> In this case, STMT_VINFO_REDUC_IDX of dot_prod1/2 == -1
>
> For the current work, the check is needed to filter out non-reduction
> statements, but since it is expected to be removed later, the check is
> placed at a late point.
>
> >> +  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
> >> +
> >> +  if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
> >> +    return false;
> >> +
> >> +  /* Do not try to vectorize bit-precision reductions.  */
> >> +  if (!type_has_mode_precision_p (type))
> >> +    return false;
> >> +
> >> +  tree vectype_in = NULL_TREE;
> >> +
> >> +  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
> >> +    {
> >> +      stmt_vec_info def_stmt_info;
> >> +      slp_tree slp_op;
> >> +      tree op;
> >> +      tree vectype;
> >> +      enum vect_def_type dt;
> >> +
> >> +      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
> >> +                               &slp_op, &dt, &vectype, &def_stmt_info))
> >> +        {
> >> +          if (dump_enabled_p ())
> >> +            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> +                             "use not simple.\n");
> >> +          return false;
> >> +        }
> >> +
> >> +      if (!vectype)
> >> +        {
> >> +          vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
> >> +                                                 slp_op);
> >> +          if (!vectype)
> >> +            return false;
> >> +        }
> >> +
> >> +      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
> >
> > Please avoid this during transform.
>
> This function is only for analysis, not transform.
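For reference while reading the new function: a rough scalar sketch of what one
output lane of each of the three lane-reducing ops computes.  This is
illustrative only -- the lane counts follow the 128-bit example in the commit
message, and the helper names are invented, not GCC internals:

    /* Illustrative per-output-lane semantics of the lane-reducing ops,
       assuming 128-bit vectors: a vector(4) int accumulator takes 4 char
       lanes (dot-prod/widen-sum) or 2 short lanes (sad) per int lane.  */

    static int
    dot_prod_lane (const signed char *d0, const signed char *d1, int acc)
    {
      for (int k = 0; k < 4; k++)     /* 4 char lanes -> 1 int lane */
        acc += d0[k] * d1[k];
      return acc;
    }

    static int
    widen_sum_lane (const signed char *w, int acc)
    {
      for (int k = 0; k < 4; k++)     /* widen each char, then sum */
        acc += w[k];
      return acc;
    }

    static int
    sad_lane (const short *s0, const short *s1, int acc)
    {
      for (int k = 0; k < 2; k++)     /* 2 short lanes -> 1 int lane */
        acc += s0[k] > s1[k] ? s0[k] - s1[k] : s1[k] - s0[k];
      return acc;
    }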
> >> +        {
> >> +          if (dump_enabled_p ())
> >> +            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> +                             "incompatible vector types for invariants\n");
> >> +          return false;
> >> +        }
> >> +
> >> +      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> >> +        continue;
> >> +
> >> +      /* There should be at most one cycle def in the stmt.  */
> >> +      if (VECTORIZABLE_CYCLE_DEF (dt))
> >> +        return false;
> >> +
> >> +      /* To properly compute ncopies we are interested in the widest
> >> +         non-reduction input type in case we're looking at a widening
> >> +         accumulation that we later handle in vect transformation.  */
> >> +      if (!vectype_in
> >> +          || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> >> +              < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
> >> +        vectype_in = vectype;
> >> +    }
> >> +
> >> +  STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >
> > As said below I wonder where we would need STMT_VINFO_REDUC_VECTYPE_IN.
> > At least you should avoid re-setting this when !cost_vec aka during
> > transform, possibly instead asserting you re-compute the same type (or
> > simply skip the above loop and set vectype_in from
> > STMT_VINFO_REDUC_VECTYPE_IN which then gets a good use).
> >
> > Likewise.
>
> >> +  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
> >> +
> >> +  /* TODO: Support lane-reducing operations that do not directly
> >> +     participate in loop reduction.  */
> >> +  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
> >> +    return false;
> >> +
> >> +  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
> >> +     recognized.  */
> >> +  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
> >> +  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
> >> +
> >> +  tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> >> +
> >> +  /* To accommodate lane-reducing operations of mixed input vectypes, choose
> >> +     the input vectype with the least lanes for the reduction PHI statement,
> >> +     which would result in the most ncopies for vectorized reduction
> >> +     results.  */
> >> +  if (!vphi_vectype_in
> >> +      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> >> +          > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
> >> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> >
> > Likewise.
>
> >> +  int ncopies_for_cost;
> >> +
> >> +  if (slp_node)
> >> +    {
> >> +      /* Now lane-reducing operations in a slp node should only come from
> >> +         the same loop reduction path.  */
> >> +      gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
> >> +      ncopies_for_cost = 1;
> >> +    }
> >> +  else
> >> +    {
> >> +      ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
> >> +      gcc_assert (ncopies_for_cost >= 1);
> >> +    }
> >> +
> >> +  if (vect_is_emulated_mixed_dot_prod (stmt_info))
> >> +    {
> >> +      /* We need two extra invariants: one that contains the minimum signed
> >> +         value and one that contains half of its negative.  */
> >> +      int prologue_stmts = 2;
> >> +      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
> >> +                                        scalar_to_vec, stmt_info, 0,
> >> +                                        vect_prologue);
> >> +      if (dump_enabled_p ())
> >> +        dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
> >> +                     "extra prologue_cost = %d .\n", cost);
> >> +
> >> +      /* Three dot-products and a subtraction.  */
> >> +      ncopies_for_cost *= 4;
> >> +    }
> >> +
> >> +  record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
> >> +                    vect_body);
> >> +
> >> +  vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
> >> +                                     type, vectype_in);
> >> +
> >> +  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
> >
> > Uh, so those all go through vect_transform_reduction.  I see.
> >
> > I fail to see a check for whether the target supports the lane-reducing op.
> > vectorizable_reduction only checks the last one.  Currently the check
> > might be redundant with what pattern recognition checks but it's still
> > incomplete compared to the check in vectorizable_reduction.
>
> In the original vectorizable_reduction, the target support check is
> deliberately skipped for lane-reducing operations.  The reason is partly as
> you said; moreover, the other check would never be executed.
>
>       if (single_defuse_cycle || lane_reduc_code_p)
>         {
>           gcc_assert (op.code != COND_EXPR);
>
>           /* 4. Supportable by target?  */
>           bool ok = true;
>
>           /* 4.1. check support for the operation in the loop
>
>              This isn't necessary for the lane reduction codes, since they
>              can only be produced by pattern matching, and it's up to the
>              pattern matcher to test for support.  The main reason for
>              specifically skipping this step is to avoid rechecking whether
>              mixed-sign dot-products can be implemented using signed
>              dot-products.  */
>           machine_mode vec_mode = TYPE_MODE (vectype_in);
>           if (!lane_reduc_code_p    //<----------- skip
>               && !directly_supported_p (op.code, vectype_in, optab_vector))
>             {
>               if (dump_enabled_p ())
>                 dump_printf (MSG_NOTE, "op not supported by target.\n");
>               if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
>                   || !vect_can_vectorize_without_simd_p (op.code))
>                 ok = false;
>               else
>                 if (dump_enabled_p ())
>                   dump_printf (MSG_NOTE, "proceeding using word mode.\n");
>             }
>
>           // <----- always false for lane-reducing op
>
>           if (vect_emulated_vector_p (vectype_in)
>               && !vect_can_vectorize_without_simd_p (op.code))
>             {
>               if (dump_enabled_p ())
>                 dump_printf (MSG_NOTE, "using word mode not possible.\n");
>               return false;
>             }
>
> >> +  return true;
> >> +}
> >> +
> >>  /* Function vectorizable_reduction.
> >>
> >>     Check if STMT_INFO performs a reduction operation that can be vectorized.
> >> @@ -7609,6 +7766,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  		      (gimple_bb (reduc_def_phi)->loop_father));
> >>    unsigned reduc_chain_length = 0;
> >>    bool only_slp_reduc_chain = true;
> >> +  bool only_lane_reducing = true;
> >>    stmt_info = NULL;
> >>    slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
> >>    while (reduc_def != PHI_RESULT (reduc_def_phi))
> >> @@ -7659,9 +7817,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  	      return false;
> >>  	    }
> >>  	}
> >> -      else if (!stmt_info)
> >> -	/* First non-conversion stmt.  */
> >> -	stmt_info = vdef;
> >> +      else
> >> +	{
> >> +	  /* First non-conversion stmt.  */
> >> +	  if (!stmt_info)
> >> +	    stmt_info = vdef;
> >> +
> >> +	  if (!lane_reducing_op_p (op.code))
> >> +	    only_lane_reducing = false;
> >> +	}
> >> +
> >>        reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> >>        reduc_chain_length++;
> >>        if (!stmt_info && slp_node)
> >> @@ -7733,18 +7898,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>    if (!type_has_mode_precision_p (op.type))
> >>      return false;
> >>
> >> -  /* For lane-reducing ops we're reducing the number of reduction PHIs
> >> -     which means the only use of that may be in the lane-reducing
> >> -     operation.  */
> >> -  if (lane_reducing
> >> -      && reduc_chain_length != 1
> >> -      && !only_slp_reduc_chain)
> >> -    {
> >> -      if (dump_enabled_p ())
> >> -	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> -			 "lane-reducing reduction with extra stmts.\n");
> >> -      return false;
> >> -    }
> >> -
> >>    /* Lane-reducing ops also never can be used in a SLP reduction group
> >>       since we'll mix lanes belonging to different reductions.  But it's
> >>       OK to use them in a reduction chain or when the reduction group
> >> @@ -7788,9 +7941,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  			   "use not simple.\n");
> >>  	  return false;
> >>  	}
> >> -      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
> >> -	continue;
> >> -
> >
> > So within this loop we analyze the "main" operation, while I do not exactly
> > remember why we skip the op leading to the PHI I don't understand why you
> > want to look at it for the multi lane-reducing case (the accumulator
> > always has the same type, no?).
> >
> > In any case this just looks at a single (the last) lane-reducing or even
> > not lane-reducing op.
>
> This comparison is redundant, since it could be covered by the following
> comparison statement.  The change should have been placed in a separate
> patch, but for convenience I made it here.
>
>       /* For an IFN_COND_OP we might hit the reduction definition operand
>          twice (once as definition, once as else).  */
>       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
>         continue;
>
>       /* There should be only one cycle def in the stmt, the one
>          leading to reduc_def.  */
>       if (VECTORIZABLE_CYCLE_DEF (dt))
>         return false;
>
> >>        /* For an IFN_COND_OP we might hit the reduction definition operand
> >>  	 twice (once as definition, once as else).  */
> >>        if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> >> @@ -7836,17 +7986,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>    if (!vectype_in)
> >>      vectype_in = STMT_VINFO_VECTYPE (phi_info);
> >> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> >>
> >> -  /* Each lane-reducing operation has its own input vectype, while reduction
> >> -     PHI records the input vectype with least lanes.  */
> >> -  if (lane_reducing)
> >> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
> >> -
> >> -  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> -  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
> >> +  /* If there is a normal (non-lane-reducing) operation in the loop reduction
> >> +     path, to ensure there will be enough copies to hold vectorized results
> >> +     of the operation, we need to set the input vectype of the reduction PHI
> >> +     to be the same as the reduction output vectype somewhere; here is a
> >> +     suitable place.  Otherwise the input vectype is set to the one with the
> >> +     least lanes, which can only be determined in the vectorizable analysis
> >> +     routine of a lane-reducing operation.  */
> > But we are using vectype_in to compute ncopies which is used in cost
> > analysis.
>
> The vectype_in only impacts the cost analysis for lane-reducing ops, since
> the function vect_is_emulated_mixed_dot_prod needs it, and that function is
> referenced by the cost analysis.  In the previous patch, we bind the
> vectype_in to each lane-reducing op and also adjust the code of the function
> accordingly, so this would not be a problem.
>
> > You say this might not be the final ncopies?  Note the vectorization
> > factor is already fixed as well as (output) vector types of the
> > lane-reducing ops.  So
>
> The vectype_in is incrementally updated while analyzing the vectorizability
> of lane-reducing ops.  So before transform, the type should be determined.
>
> > shouldn't we simply pick that up in the loop walking the use-def chain via
> > REDUC_IDX at the start of this function?
>
> I thought about doing it that way.  Ok, I will consider it again.
>
> > I'm unsure as to why we need STMT_VINFO_REDUC_VECTYPE_IN at all (I don't
> > remember adding that), it should be readily available from operand
> > analysis.  The docs for that aren't very enlightening either (there's also
> > REDUC_VECTYPE, in addition to VECTYPE - huh).
>
> For the old code, in which only one lane-reducing op is allowed in a loop
> reduction, this type might be computed on demand.
>
> But for multiple lane-reducing ops, we need to know the vectype_in types
> of all ops in order to determine a proper vectype_in for the PHI statement,
> so traversing those ops and computing the types on demand would not be a
> good way.  Additionally, during transform, the original CFG is broken and
> cannot be used.
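To make the ncopies arithmetic concrete -- a worked example under the 128-bit
setting from the commit message, using the existing definition that
vect_get_num_copies is the vectorization factor divided by the number of lanes
of the given vectype:

    /* Worked example, VF = 16 with 128-bit vectors:

         dot-prod:  vectype_in = vector(16) char  -> ncopies = 16 / 16 = 1
         sad:       vectype_in = vector(8) short  -> ncopies = 16 / 8  = 2
         normal op: vectype_in = vector(4) int    -> ncopies = 16 / 4  = 4

       Picking the least-lanes vectype (vector(4) int) for the reduction
       PHI yields the most copies (4), matching sum_v0..sum_v3 in the
       example; lane-reducing statements with fewer effective copies are
       padded with the trivial pass-through copies.  */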
> >> +  if (!only_lane_reducing)
> >> +    STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
> >> +
> >> +  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
> >> +  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> >>    /* If we have a condition reduction, see if we can simplify it further.  */
> >> -  if (v_reduc_type == COND_REDUCTION)
> >> +  if (reduction_type == COND_REDUCTION)
> >>      {
> >>        if (slp_node)
> >>  	return false;
> >> @@ -8012,8 +8166,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>      }
> >>
> >>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> >> +  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>
> >> -  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == TREE_CODE_REDUCTION)
> >>      {
> >>        /* Check whether it's ok to change the order of the computation.
> >> @@ -8287,14 +8441,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>        && loop_vinfo->suggested_unroll_factor == 1)
> >>      single_defuse_cycle = true;
> >>
> >> -  if (single_defuse_cycle || lane_reducing)
> >> +  if (single_defuse_cycle && !lane_reducing)
> >>      {
> >>        gcc_assert (op.code != COND_EXPR);
> >>
> >> -      /* 4. Supportable by target?  */
> >> -      bool ok = true;
> >> -
> >> -      /* 4.1. check support for the operation in the loop
> >> +      /* 4. check support for the operation in the loop
> >>
> >>  	 This isn't necessary for the lane reduction codes, since they
> >>  	 can only be produced by pattern matching, and it's up to the
> >> @@ -8303,14 +8454,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  	 mixed-sign dot-products can be implemented using signed
> >>  	 dot-products.  */
> >>        machine_mode vec_mode = TYPE_MODE (vectype_in);
> >> -      if (!lane_reducing
> >> -	  && !directly_supported_p (op.code, vectype_in, optab_vector))
> >> +      if (!directly_supported_p (op.code, vectype_in, optab_vector))
> >>  	{
> >>  	  if (dump_enabled_p ())
> >>  	    dump_printf (MSG_NOTE, "op not supported by target.\n");
> >>  	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
> >>  	      || !vect_can_vectorize_without_simd_p (op.code))
> >> -	    ok = false;
> >> +	    single_defuse_cycle = false;
> >>  	  else
> >>  	    if (dump_enabled_p ())
> >>  	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
> >> @@ -8323,35 +8473,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>  	  dump_printf (MSG_NOTE, "using word mode not possible.\n");
> >>  	  return false;
> >>  	}
> >> -
> >> -      /* lane-reducing operations have to go through
> >> -	 vect_transform_reduction.  For the other cases try without the
> >> -	 single cycle optimization.  */
> >> -      if (!ok)
> >> -	{
> >> -	  if (lane_reducing)
> >> -	    return false;
> >> -	  else
> >> -	    single_defuse_cycle = false;
> >> -	}
> >>      }
> >>    STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
> >>
> >> -  /* If the reduction stmt is one of the patterns that have lane
> >> -     reduction embedded we cannot handle the case of !single_defuse_cycle.  */
> >> -  if ((ncopies > 1 && !single_defuse_cycle)
> >> -      && lane_reducing)
> >> -    {
> >> -      if (dump_enabled_p ())
> >> -	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >> -			 "multi def-use cycle not possible for lane-reducing "
> >> -			 "reduction operation\n");
> >> -      return false;
> >> -    }
> >> -
> >> -  if (slp_node
> >> -      && !(!single_defuse_cycle
> >> -	   && !lane_reducing
> >> -	   && reduction_type != FOLD_LEFT_REDUCTION))
> >> +  /* The reduction type of a lane-reducing operation is TREE_CODE_REDUCTION;
> >> +     the processing below will be done in its own vectorizable function.  */
> >> +  if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
> >>      for (i = 0; i < (int) op.num_ops; i++)
> >>        if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
> >>  	{
> >> @@ -8364,28 +8491,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >>    vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
> >>  			     reduction_type, ncopies, cost_vec);
> >>    /* Cost the reduction op inside the loop if transformed via
> >> -     vect_transform_reduction.  Otherwise this is costed by the
> >> -     separate vectorizable_* routines.  */
> >> -  if (single_defuse_cycle || lane_reducing)
> >> -    {
> >> -      int factor = 1;
> >> -      if (vect_is_emulated_mixed_dot_prod (stmt_info))
> >> -	/* Three dot-products and a subtraction.  */
> >> -	factor = 4;
> >> -      record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
> >> -			stmt_info, 0, vect_body);
> >> -    }
> >> +     vect_transform_reduction for non-lane-reducing operation.  Otherwise
> >> +     this is costed by the separate vectorizable_* routines.  */
> >> +  if (single_defuse_cycle && !lane_reducing)
> >> +    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
> >>
> >>    if (dump_enabled_p ()
> >>        && reduction_type == FOLD_LEFT_REDUCTION)
> >>      dump_printf_loc (MSG_NOTE, vect_location,
> >>  		     "using an in-order (fold-left) reduction.\n");
> >>    STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
> >> -  /* All but single defuse-cycle optimized, lane-reducing and fold-left
> >> -     reductions go through their own vectorizable_* routines.  */
> >> -  if (!single_defuse_cycle
> >> -      && !lane_reducing
> >> -      && reduction_type != FOLD_LEFT_REDUCTION)
> >> +
> >> +  /* All but single defuse-cycle optimized and fold-left reductions go
> >> +     through their own vectorizable_* routines.  */
> >> +  if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
> >> +      || lane_reducing)
> >
> > So single-def-use-cycle but lane-reducing ops no longer need
> > to go through vect_transform_reduction?  How do you handle those
> > but fail to handle non-lane-reducing ops this way?
>
> Emm, all lane-reducing ops will go into vectorizable_lane_reducing(),
> whether single-def-use or not; in that function, STMT_VINFO_TYPE is set to
> reduc_vec_info_type, so the transform will be done inside
> vect_transform_reduction.
>
> >>      {
> >>        stmt_vec_info tem
> >>  	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
> >> @@ -8490,6 +8610,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> >>    int i;
> >>    int ncopies;
> >> +  int stmt_ncopies;
> >>    int vec_num;
> >>
> >>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> >> @@ -8513,15 +8634,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
> >>    int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
> >>    tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
> >> +  tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
> >> +
> >> +  /* Get input vectypes from the reduction PHI and the statement to be
> >> +     transformed; these two vectypes may have different lanes when a
> >> +     lane-reducing operation is present.  */
> >> +  if (!vectype_in)
> >> +    vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
> >> +
> >> +  if (!stmt_vectype_in)
> >> +    stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
> >>
> >>    if (slp_node)
> >>      {
> >>        ncopies = 1;
> >> +      stmt_ncopies = 1;
> >>        vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> >>      }
> >>    else
> >>      {
> >>        ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
> >> +      stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
> >> +      gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
> >>        vec_num = 1;
> >>      }
> >>
> >> @@ -8530,14 +8664,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> >>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> >> -  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> >> -
> >> +  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> >> +						    stmt_vectype_in);
> >>    /* Transform.  */
> >> -  tree new_temp = NULL_TREE;
> >> -  auto_vec<tree> vec_oprnds0;
> >> -  auto_vec<tree> vec_oprnds1;
> >> -  auto_vec<tree> vec_oprnds2;
> >> -  tree def0;
> >> +  auto_vec<tree> vec_oprnds[3];
> >>
> >>    if (dump_enabled_p ())
> >>      dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
> >> @@ -8561,8 +8691,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>  		  == op.ops[internal_fn_else_index ((internal_fn) code)]));
> >>      }
> >>
> >> -  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >> -
> >>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> >>    if (reduction_type == FOLD_LEFT_REDUCTION)
> >>      {
> >>        gcc_assert (code.is_tree_code () || cond_fn_p);
> >>        return vectorize_fold_left_reduction
> >>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> >> -	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> >> +	   code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
> >>  	   reduc_index, masks, lens);
> >>      }
> >>
> >> @@ -8581,55 +8709,121 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
> >>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
> >>
> >> -  /* Get NCOPIES vector definitions for all operands except the reduction
> >> -     definition.  */
> >> -  if (!cond_fn_p)
> >> +  gcc_assert (reduc_index < 3);
> >> +
> >> +  if (slp_node)
> >>      {
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -			 single_defuse_cycle && reduc_index == 0
> >> -			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
> >> -			 single_defuse_cycle && reduc_index == 1
> >> -			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
> >> -			 op.num_ops == 3
> >> -			 && !(single_defuse_cycle && reduc_index == 2)
> >> -			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> >> +      gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
> >
> > I think that's going to fail.  Mind v3 of the series I posted to enable
> > SLP discovery for single-lane reductions.  Basically everything is
> > going to be SLP for GCC 15.
>
> Has v3 already landed on trunk?  Then by default, any statement that has
> no isomorphic partner will become a single-lane SLP node?  And for such a
> node, can I just reuse the old non-SLP transformation code?
As of this morning, r15-1006-gd93353e6423eca, it is on trunk.  Note the
fallback is still non-SLP in case vectorizable_reduction FAILs with SLP.
I have a set of changes queued to allow some more kinds of reductions with
SLP, but IIRC the lane-reducing variant is already supported.

Richard.

> >> +
> >> +      for (i = 0; i < (int) op.num_ops; i++)
> >> +	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
> >>      }
> >>    else
> >>      {
> >> -      /* For a conditional operation pass the truth type as mask
> >> -	 vectype.  */
> >> -      gcc_assert (single_defuse_cycle
> >> -		  && (reduc_index == 1 || reduc_index == 2));
> >> -      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
> >> -			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
> >> -			 reduc_index == 1 ? NULL_TREE : op.ops[1],
> >> -			 NULL_TREE, &vec_oprnds1,
> >> -			 reduc_index == 2 ? NULL_TREE : op.ops[2],
> >> -			 NULL_TREE, &vec_oprnds2);
> >> -    }
> >> +      /* The input vectype of the reduction PHI determines the number of
> >> +	 copies of vectorized def-use cycles, which might be more than the
> >> +	 effective copies of vectorized lane-reducing reduction statements.
> >> +	 This could be complemented by generating extra trivial pass-through
> >> +	 copies.  For example:
> >> +
> >
> > That also means you need to handle SLP here, but you can assert there's
> > only a single lane.
> >
> > Btw, you can push the patches I approved if they independently test OK.
>
> >> +	   int sum = 0;
> >> +	   for (i)
> >> +	     {
> >> +	       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
> >> +	       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
> >> +	       sum += n[i];               // normal <vector(4) int>
> >> +	     }
> >> +
> >> +	 The vector size is 128-bit, and the vectorization factor is 16.
> >> +	 Reduction statements would be transformed as:
> >> +
> >> +	   vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >> +	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >> +	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >> +	   vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >> +
> >> +	   for (i / 16)
> >> +	     {
> >> +	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >> +	       sum_v1 = sum_v1;  // copy
> >> +	       sum_v2 = sum_v2;  // copy
> >> +	       sum_v3 = sum_v3;  // copy
> >> +
> >> +	       sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >> +	       sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >> +	       sum_v2 = sum_v2;  // copy
> >> +	       sum_v3 = sum_v3;  // copy
> >> +
> >> +	       sum_v0 += n_v0[i: 0 ~ 3 ];
> >> +	       sum_v1 += n_v1[i: 4 ~ 7 ];
> >> +	       sum_v2 += n_v2[i: 8 ~ 11];
> >> +	       sum_v3 += n_v3[i: 12 ~ 15];
> >> +	     }
> >> +       */
> >> +
> >> +      for (i = 0; i < MIN (3, (int) op.num_ops); i++)
> >> +	{
> >> +	  tree vectype = NULL_TREE;
> >> +	  int used_ncopies = ncopies;
> >> +
> >> +	  if (cond_fn_p && i == 0)
> >> +	    {
> >> +	      /* For a conditional operation pass the truth type as mask
> >> +		 vectype.  */
> >> +	      gcc_assert (single_defuse_cycle && reduc_index > 0);
> >> +	      vectype = truth_type_for (vectype_in);
> >> +	    }
> >>
> >> -  /* For single def-use cycles get one copy of the vectorized reduction
> >> -     definition.  */
> >> -  if (single_defuse_cycle)
> >> -    {
> >> -      gcc_assert (!slp_node);
> >> -      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> >> -				     op.ops[reduc_index],
> >> -				     reduc_index == 0 ? &vec_oprnds0
> >> -				     : (reduc_index == 1 ? &vec_oprnds1
> >> -					: &vec_oprnds2));
> >> +	  if (i != reduc_index)
> >> +	    {
> >> +	      /* For a non-reduction operand, deduce the effective copies that
> >> +		 are involved in vectorized def-use cycles based on the input
> >> +		 vectype of the reduction statement.  */
> >> +	      used_ncopies = stmt_ncopies;
> >> +	    }
> >> +	  else if (single_defuse_cycle)
> >> +	    {
> >> +	      /* For single def-use cycles get one copy of the vectorized
> >> +		 reduction definition.  */
> >> +	      used_ncopies = 1;
> >> +	    }
> >> +
> >> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
> >> +					 op.ops[i], &vec_oprnds[i], vectype);
> >> +
> >> +	  if (used_ncopies < ncopies)
> >> +	    vec_oprnds[i].safe_grow_cleared (ncopies);
> >> +	}
> >>      }
> >>
> >> +  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> >>    bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
> >> +  tree def0;
> >>
> >> -  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
> >> +  FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
> >>      {
> >>        gimple *new_stmt;
> >> -      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
> >> -      if (masked_loop_p && !mask_by_cond_expr)
> >> +      tree new_temp = NULL_TREE;
> >> +      tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
> >> +
> >> +      if (!vop[0] || !vop[1])
> >> +	{
> >> +	  tree reduc_vop = vec_oprnds[reduc_index][i];
> >> +
> >> +	  /* Insert a trivial copy when there is no need to generate a
> >> +	     vectorized statement.  */
> >> +	  gcc_assert (reduc_vop && stmt_ncopies < ncopies);
> >> +
> >> +	  new_stmt = gimple_build_assign (vec_dest, reduc_vop);
> >> +	  new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +	  gimple_set_lhs (new_stmt, new_temp);
> >> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
> >> +	}
> >> +      else if (masked_loop_p && !mask_by_cond_expr)
> >>  	{
> >> -	  /* No conditional ifns have been defined for dot-product yet.  */
> >> -	  gcc_assert (code != DOT_PROD_EXPR);
> >> +	  /* No conditional ifns have been defined for dot-product and sad
> >> +	     yet.  */
> >> +	  gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
> >>
> >>  	  /* Make sure that the reduction accumulator is vop[0].  */
> >>  	  if (reduc_index == 1)
> >> @@ -8638,7 +8832,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>  	      std::swap (vop[0], vop[1]);
> >>  	    }
> >>  	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -					  vec_num * ncopies, vectype_in, i);
> >> +					  vec_num * stmt_ncopies,
> >> +					  stmt_vectype_in, i);
> >>  	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
> >>  						    vop[0], vop[1], vop[0]);
> >>  	  new_temp = make_ssa_name (vec_dest, call);
> >> @@ -8650,12 +8845,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>        else
> >>  	{
> >>  	  if (op.num_ops >= 3)
> >> -	    vop[2] = vec_oprnds2[i];
> >> +	    vop[2] = vec_oprnds[2][i];
> >>
> >>  	  if (masked_loop_p && mask_by_cond_expr)
> >>  	    {
> >>  	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> >> -					      vec_num * ncopies, vectype_in, i);
> >> +					      vec_num * stmt_ncopies,
> >> +					      stmt_vectype_in, i);
> >>  	      build_vect_cond_expr (code, vop, mask, gsi);
> >>  	    }
> >>
> >> @@ -8682,16 +8878,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> >>
> >>        if (slp_node)
> >>  	slp_node->push_vec_def (new_stmt);
> >> -      else if (single_defuse_cycle
> >> -	       && i < ncopies - 1)
> >> -	{
> >> -	  if (reduc_index == 0)
> >> -	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
> >> -	  else if (reduc_index == 1)
> >> -	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
> >> -	  else if (reduc_index == 2)
> >> -	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
> >> -	}
> >> +      else if (single_defuse_cycle && i < ncopies - 1)
> >> +	vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
> >>        else
> >>  	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>      }
> >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> index 2e0be763abb..cc0a832f71b 100644
> >> --- a/gcc/tree-vect-stmts.cc
> >> +++ b/gcc/tree-vect-stmts.cc
> >> @@ -13296,6 +13296,8 @@ vect_analyze_stmt (vec_info *vinfo,
> >>  					  NULL, NULL, node, cost_vec)
> >>  	  || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >>  	  || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
> >> +	  || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
> >> +					 stmt_info, node, cost_vec)
> >>  	  || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
> >>  				     node, node_instance, cost_vec)
> >>  	  || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
> >> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> >> index 97ec9c341e7..ca810869592 100644
> >> --- a/gcc/tree-vectorizer.h
> >> +++ b/gcc/tree-vectorizer.h
> >> @@ -2443,6 +2443,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
> >>  extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
> >>  					 slp_tree, slp_instance, int,
> >>  					 bool, stmt_vector_for_cost *);
> >> +extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
> >> +					slp_tree, stmt_vector_for_cost *);
> >>  extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
> >>  				    slp_tree, slp_instance,
> >>  				    stmt_vector_for_cost *);
> >> --
> >> 2.17.1